acquisitionset.seed(100)
data(acquisitionRetention)
data1 = acquisitionRetention
index = sample(nrow(data1),0.8*nrow(data1))
train = data1[index,]
test = data1[-index,]
anyNA(data1)
model <- glm(acquisition ~acq_exp + industry + revenue + employees, data = train, family = binomial)
vif(model)
## [1] FALSE
## acq_exp industry revenue employees
## 1.010358 1.100550 1.054522 1.130627
Note that there are no missing values for this data set. After removing variables with perfect separation, we are left with the variables acq_exp, industry, revenue, and employees. As we can see from the VIF output, there are no multicollinearity issues with this model.
acquisitionset.seed(100)
model <- glm(acquisition ~ acq_exp + industry + revenue + employees, data = train, family = binomial)
#cooks distance plot
dev <-residuals(model, type = "deviance")
pea <-residuals(model, type = "pearson")
devstd<-residuals(model, type = "deviance")/sqrt(1 - hatvalues(model)) # standardized deviance residuals
peastd <-residuals(model, type = "pearson")/sqrt(1 - hatvalues(model)) # standardized pearson residuals
dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))
plot(devstd[model$model$acquisition==1], col = "red",
ylim = c(-3.5,3.5), ylab = "std. deviance residuals", xlab = "ID")
points(devstd[model$model$acquisition==0], col = "blue")
plot(peastd[model$model$acquisition==1], col = "red",
ylim = c(-3.5,3.5), ylab = "std. Pearson residuals", xlab = "ID")
points(peastd[model$model$acquisition==0], col = "blue")
#taking out outliers
cooks_sd_cutoff = 3*sd(cooks.distance(model))
out <- which(cooks.distance(model)>cooks_sd_cutoff)
model <- glm(as.factor(acquisition) ~ acq_exp + industry + revenue + employees, data = train[-out,], family = binomial)
summary(model)
#Maximized cutoff (change predict object)
preds <- prediction(predict(model, test, type = "response"),
test$acquisition)
#Predicted Probability and True Classification
auc <- round(as.numeric(performance(preds, measure = "auc")@y.values),3)
false.rates <-performance(preds, "fpr","fnr")
accuracy <-performance(preds, "acc","err")
plot(unlist(performance(preds, "sens")@x.values), unlist(performance(preds, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(preds, "spec")@x.values), unlist(performance(preds, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(preds, "sens")@y.values) - unlist(performance(preds, "spec")@y.values)))
min.x<-unlist(performance(preds, "sens")@x.values)[min.diff]
min.y<-unlist(performance(preds, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
#ROC plot (don't need to change)
perf <- performance(preds, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
#Confusion Matrix (change predict object)
predict = predict(model, test, type = "response")
#optcutoff = optimalCutoff(test$acquisition,predict)
predict_choice = ifelse(predict>optimal, 1,0)
caret = caret::confusionMatrix(as.factor(predict_choice),as.factor(test$acquisition), positive = '1')
caret
#diagnostics
cooks_distance = cooks.distance(model)
plot(cooks_distance,col="pink", pch=19, cex=1)
text(cooks.distance(model),labels = rownames(train))
exp(coef(model))
LR = Anova(model, test = "LR")
Wald = Anova(model, test = "Wald")
score = anova(model, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[2], Wald = Wald$Chisq)
data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[2], p.Wald = Wald$`Pr(>Chisq)`)
accuracy.list = c()
accuracy.list[1]=caret$overall[1]
##
## Call:
## glm(formula = as.factor(acquisition) ~ acq_exp + industry + revenue +
## employees, family = binomial, data = train[-out, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4253 -0.3946 0.1900 0.5450 2.2486
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.8713978 1.1524949 -7.698 1.39e-14 ***
## acq_exp -0.0004139 0.0008896 -0.465 0.642
## industry 1.8230295 0.3346855 5.447 5.12e-08 ***
## revenue 0.0852951 0.0170349 5.007 5.53e-07 ***
## employees 0.0094531 0.0010891 8.680 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 478.16 on 389 degrees of freedom
## Residual deviance: 271.91 on 385 degrees of freedom
## AIC: 281.91
##
## Number of Fisher Scoring iterations: 6
##
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 29 15
## 1 8 48
##
## Accuracy : 0.77
## 95% CI : (0.6751, 0.8483)
## No Information Rate : 0.63
## P-Value [Acc > NIR] : 0.001979
##
## Kappa : 0.5252
##
## Mcnemar's Test P-Value : 0.210903
##
## Sensitivity : 0.7619
## Specificity : 0.7838
## Pos Pred Value : 0.8571
## Neg Pred Value : 0.6591
## Prevalence : 0.6300
## Detection Rate : 0.4800
## Detection Prevalence : 0.5600
## Balanced Accuracy : 0.7728
##
## 'Positive' Class : 1
##
## (Intercept) acq_exp industry revenue employees
## 0.0001403463 0.9995861963 6.1905846993 1.0890383556 1.0094979122
## LR Score Wald
## 1 0.216812 0.01611407 0.2164745
## 2 34.901514 0.01611407 29.6697266
## 3 28.945128 0.01611407 25.0708574
## 4 153.889774 0.01611407 75.3339113
## p.LR p.Score p.Wald
## 1 6.414798e-01 0.8989869 6.417394e-01
## 2 3.468110e-09 0.8989869 5.122864e-08
## 3 7.445784e-08 0.8989869 5.526175e-07
## 4 2.448035e-35 0.8989869 3.974740e-18
Using a cutoff value of 0.7698277 we get an accuracy of 77% with logistic regression.
acquisitionset.seed(100)
# simple DT model
model <- rpart(as.factor(acquisition) ~ acq_exp + industry + revenue + employees, data = train)
# visualize the DT
rattle::fancyRpartPlot(model, sub = "")
# save predictions from DT
preds = predict(model,test,type="class")
caret = caret::confusionMatrix(as.factor(preds),as.factor(test$acquisition), positive = '1')
caret
accuracy.list[2]=caret$overall[1]
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19 12
## 1 18 51
##
## Accuracy : 0.7
## 95% CI : (0.6002, 0.7876)
## No Information Rate : 0.63
## P-Value [Acc > NIR] : 0.08771
##
## Kappa : 0.3342
##
## Mcnemar's Test P-Value : 0.36131
##
## Sensitivity : 0.8095
## Specificity : 0.5135
## Pos Pred Value : 0.7391
## Neg Pred Value : 0.6129
## Prevalence : 0.6300
## Detection Rate : 0.5100
## Detection Prevalence : 0.6900
## Balanced Accuracy : 0.6615
##
## 'Positive' Class : 1
##
We get an accuracy of 70% with decision tree method.
acquisitionset.seed(100)
model = randomForest(as.factor(acquisition)~ acq_exp + industry + revenue + employees, data = train, mtry=4, ntree=1000, importance=T)
#Then we calculate the test error of bagging
preds = predict(model, newdata = test)
caret = caret::confusionMatrix(as.factor(preds),as.factor(test$acquisition), positive = '1')
caret
accuracy.list[3]=caret$overall[1]
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 20 11
## 1 17 52
##
## Accuracy : 0.72
## 95% CI : (0.6213, 0.8052)
## No Information Rate : 0.63
## P-Value [Acc > NIR] : 0.03722
##
## Kappa : 0.3786
##
## Mcnemar's Test P-Value : 0.34470
##
## Sensitivity : 0.8254
## Specificity : 0.5405
## Pos Pred Value : 0.7536
## Neg Pred Value : 0.6452
## Prevalence : 0.6300
## Detection Rate : 0.5200
## Detection Prevalence : 0.6900
## Balanced Accuracy : 0.6830
##
## 'Positive' Class : 1
##
We get an accuracy of 72% with the random forest method.
acquisition1. Tuning Hyperparameters
set.seed(100)
mtry.value = seq(1,4,1)
ntree.values = seq(1e3, 10e3, 1e3)
nodesize.values = seq(2,16,2)
# Create a data frame containing all combinations
hypergrid = expand.grid(mtry = mtry.value, nodesize = nodesize.values, ntree = ntree.values)
# Create an empty vector to store OOB error values
oob_error = c()
# Write a loop over the rows of hypergrid to train the grid of models
for (i in 1:nrow(hypergrid)) {
# Train a Random Forest model with different hyper parameters
model = randomForest(as.factor(acquisition)~ acq_exp + industry + revenue + employees, data = train, importance = T, mtry = hypergrid$mtry[i], nodesize = hypergrid$nodesize[i], ntree = hypergrid$ntree[i])
oob_error[i]= model$err.rate[length(model$err.rate)]
}
opt_i <- which.min(oob_error)
print(hypergrid[opt_i,])
Based on the tuning output, the optimal hyperparameters are: mtry = 1, nodesize = 2, and ntree = 1000.
2. Tuned Random Forest Output
set.seed(7)
model = randomForest(as.factor(acquisition)~ acq_exp + industry + revenue + employees, data = train, mtry=1, nodesize=2, ntree=1000, importance=T)
#Then we calculate the test error of bagging
preds = predict(model, newdata = test)
caret = caret::confusionMatrix(as.factor(preds),as.factor(test$acquisition), positive = '1')
caret$overall[1]
accuracy.list[4]=caret$overall[1]
## Accuracy
## 0.78
The tuned Random Forest model gives us an accuracy of 78%.
3. Variable importance
importance(model)
varImpPlot(model)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp 21.21482 19.89358 26.04854 26.761552
## industry 15.31765 15.00624 19.20478 9.036317
## revenue 15.47720 17.12373 21.22813 23.686965
## employees 46.02082 41.63922 49.32958 46.517864
Based on the above plot we can see that employees is our most important variable followed by axq_exp.
4. Checking for interaction terms
set.seed(100)
rfsrc.train = train
rfsrc.train$acquisition = as.factor(rfsrc.train$acquisition)
rfsrc.model <- rfsrc(as.factor(acquisition)~ acq_exp + industry + revenue + employees, data = rfsrc.train, mtry=1, nodesize=2, ntree=1000, importance=TRUE)
find.interaction(rfsrc.model,
method = "vimp",
importance = "permute")
## Pairing employees with acq_exp
## Pairing employees with revenue
## Pairing employees with industry
## Pairing acq_exp with revenue
## Pairing acq_exp with industry
## Pairing revenue with industry
##
## Method: vimp
## No. of variables: 4
## Variables sorted by VIMP?: TRUE
## No. of variables used for pairing: 4
## Total no. of paired interactions: 6
## Monte Carlo replications: 1
## Type of noising up used for VIMP: permute
##
## Var 1 Var 2 Paired Additive Difference
## employees:acq_exp 0.0777 0.0217 0.1047 0.0994 0.0053
## employees:revenue 0.0777 0.0196 0.0935 0.0974 -0.0039
## employees:industry 0.0777 0.0097 0.0904 0.0874 0.0030
## acq_exp:revenue 0.0227 0.0201 0.0436 0.0428 0.0008
## acq_exp:industry 0.0227 0.0113 0.0345 0.0340 0.0004
## revenue:industry 0.0177 0.0128 0.0389 0.0306 0.0084
As we can see from the above table, there are no interaction terms to consider when training a random forest on acquisition.
5. Partial Dependence Plots on acquisition
par(mfrow=c(2,2))
partialPlot(model, pred.data = train, x.var = "acq_exp")
partialPlot(model, pred.data = train, x.var = "industry")
partialPlot(model, pred.data = train, x.var = "revenue")
partialPlot(model, pred.data = train, x.var = "employees")
acquisitionsmethod = c("Logistic Regression", "Decision Tree", "Random Forest", "Tuned Random Forest")
accuracy.df = cbind(Method = method, Accuracy = accuracy.list)
kable(accuracy.df)
| Method | Accuracy |
|---|---|
| Logistic Regression | 0.77 |
| Decision Tree | 0.7 |
| Random Forest | 0.72 |
| Tuned Random Forest | 0.78 |
As we can see from the table, the Tuned Random Forest method offers us the greatest accuracy, but they are all similar.
durationacquisition predictions from our tuned RF model and create a test/train split. We will use this subsetted data to predict duration.data2 = data.frame(acquisitionRetention,prediction=preds)
## Warning in data.frame(acquisitionRetention, prediction = preds): row names were
## found from a short variable and have been discarded
data2=data2[data2$prediction==1 & data2$acquisition==1, ]
index2 = sample(nrow(data2),0.8*nrow(data2))
train2 = data2[index2,]
test2 = data2[-index2,]
Duration (no squared terms)set.seed(100)
model <- lm(duration ~profit+acq_exp+ret_exp+freq+crossbuy+sow+industry+revenue+employees, data = train2)
vif(model)
model <- lm(duration ~ acq_exp+ret_exp+freq+crossbuy+sow+industry+revenue+employees, data = train2)
vif(model)
preds = predict(model,test2)
mse = mean(((preds-test2$duration)^2))
summary(model)
print(mse)
mse.list = c()
mse.list[1] = mse
## profit acq_exp ret_exp freq crossbuy sow industry revenue
## 38.701384 9.608442 28.458096 1.124796 1.115933 1.099287 1.133198 1.135979
## employees
## 1.368262
## acq_exp ret_exp freq crossbuy sow industry revenue employees
## 1.031667 1.020111 1.020356 1.031486 1.015313 1.017015 1.050198 1.024683
##
## Call:
## lm(formula = duration ~ acq_exp + ret_exp + freq + crossbuy +
## sow + industry + revenue + employees, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -211.433 -18.688 8.916 33.887 81.384
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 524.723537 30.986843 16.934 <2e-16 ***
## acq_exp 0.006183 0.023496 0.263 0.7927
## ret_exp 1.330547 0.021604 61.587 <2e-16 ***
## freq -7.646533 0.818067 -9.347 <2e-16 ***
## crossbuy 1.930653 1.674683 1.153 0.2504
## sow 0.304886 0.174531 1.747 0.0822 .
## industry -15.867651 7.011198 -2.263 0.0247 *
## revenue -0.347417 0.380044 -0.914 0.3618
## employees -0.031493 0.014661 -2.148 0.0329 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47.3 on 195 degrees of freedom
## Multiple R-squared: 0.9536, Adjusted R-squared: 0.9517
## F-statistic: 500.6 on 8 and 195 DF, p-value: < 2.2e-16
##
## [1] 2223.701
Removed profit due to multicollinearity (VIF>10). Our MSE value is 2270.322
Duration (w/ squared terms)set.seed(100)
model <- lm(duration ~ profit + acq_exp + ret_exp + acq_exp_sq + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2)
vif(model)
model <- lm(duration ~ profit + acq_exp + acq_exp_sq + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2)
vif(model)
model <- lm(duration ~ profit + acq_exp_sq + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2)
vif(model)
model <- lm(duration ~ profit + acq_exp_sq + ret_exp_sq + freq + crossbuy + sow + industry + revenue + employees, data = train2)
vif(model)
preds = predict(model,test2)
mse = mean(((preds-test2$duration)^2))
summary(model)
print(mse)
mse.list[2] = mse
## profit acq_exp ret_exp acq_exp_sq ret_exp_sq freq freq_sq
## 107.241209 126.302949 172.069378 54.803282 37.194099 15.066793 17.512475
## crossbuy sow industry revenue employees
## 1.463018 1.387154 1.485038 1.250835 2.074465
## profit acq_exp acq_exp_sq ret_exp_sq freq freq_sq crossbuy
## 15.271309 47.352056 35.801831 10.829825 14.063664 14.402201 1.131152
## sow industry revenue employees
## 1.078643 1.062379 1.082882 1.156823
## profit acq_exp_sq ret_exp_sq freq freq_sq crossbuy sow
## 9.638643 2.810898 7.430200 13.987311 14.212484 1.096648 1.030584
## industry revenue employees
## 1.029216 1.078639 1.088507
## profit acq_exp_sq ret_exp_sq freq crossbuy sow industry
## 9.347543 2.755221 7.221587 1.025570 1.055651 1.030567 1.026849
## revenue employees
## 1.076697 1.087497
##
## Call:
## lm(formula = duration ~ profit + acq_exp_sq + ret_exp_sq + freq +
## crossbuy + sow + industry + revenue + employees, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -136.298 -34.797 -2.168 31.760 207.817
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.464e+02 4.987e+01 6.946 5.53e-11 ***
## profit 2.880e-01 1.852e-02 15.550 < 2e-16 ***
## acq_exp_sq -6.002e-04 4.738e-05 -12.668 < 2e-16 ***
## ret_exp_sq 2.039e-04 5.794e-05 3.518 0.000541 ***
## freq -5.229e+00 9.945e-01 -5.258 3.83e-07 ***
## crossbuy -2.184e+00 2.054e+00 -1.063 0.289059
## sow -9.613e-02 2.132e-01 -0.451 0.652612
## industry -3.705e+01 8.543e+00 -4.336 2.33e-05 ***
## revenue -1.515e+00 4.666e-01 -3.246 0.001377 **
## employees -1.086e-01 1.832e-02 -5.927 1.39e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57.36 on 194 degrees of freedom
## Multiple R-squared: 0.9321, Adjusted R-squared: 0.9289
## F-statistic: 295.8 on 9 and 194 DF, p-value: < 2.2e-16
##
## [1] 3499.421
Note that we removed ret_exp, acq_exp, and freq_sq due to multicollinearity (VIF>10). Our MSE value is 2945.102.
Duration (w squared terms)set.seed(100)
model <- rfsrc(duration ~ profit + acq_exp + acq_exp_sq + ret_exp + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2, ntree = 1000)
model
preds = predict(model,test2)
preds = preds$predicted
mse = mean(((preds-test2$duration)^2))
print(mse)
mse.list[3] = mse
## Sample size: 204
## Number of trees: 1000
## Forest terminal node size: 5
## Average no. of terminal nodes: 24.872
## No. of variables tried at each split: 4
## Total no. of variables: 12
## Resampling used to grow trees: swor
## Resample size used to grow trees: 129
## Analysis: RF-R
## Family: regr
## Splitting rule: mse *random*
## Number of random split points: 10
## % variance explained: 95.87
## Error rate: 1911.59
##
## [1] 2221.883
duration1. Tuning Hyperparameters for Random Forest on duration
# Establish a list of possible values for hyper-parameters
set.seed(100)
mtry.values <- seq(1,12,1)
nodesize.values <- seq(2,16,2)
ntree.values <- seq(1e3,10e3,1e3)
# Create a data frame containing all combinations
hyper_grid <- expand.grid(mtry = mtry.values, nodesize = nodesize.values, ntree = ntree.values)
# Create an empty vector to store OOB error values
oob_err <- c()
# Write a loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {
# Train a Random Forest model
model <- rfsrc(duration ~ profit + acq_exp + acq_exp_sq + ret_exp + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2,mtry = hyper_grid$mtry[i], nodesize = hyper_grid$nodesize[i], ntree = hyper_grid$ntree[i])
# Store OOB error for the model
oob_err[i] <- model$err.rate[length(model$err.rate)]
}
# Identify optimal set of hyperparmeters based on OOB error
opt_i <- which.min(oob_err)
print(hyper_grid[opt_i,])
Based on the tuning output, the optimal hyperparameters are: mtry = 6, nodesize = 2, and ntree = 5000.
2. Tuned Random Forest Output
set.seed(100)
model <- rfsrc(duration ~ profit + acq_exp + acq_exp_sq + ret_exp + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2, mtry = 6, nodesize = 2, ntree = 5000, importance=T)
model
preds = predict(model,test2)
preds = preds$predicted
mse = mean(((preds-test2$duration)^2))
print(mse)
mse.list[4] = mse
## Sample size: 204
## Number of trees: 5000
## Forest terminal node size: 2
## Average no. of terminal nodes: 63.4598
## No. of variables tried at each split: 6
## Total no. of variables: 12
## Resampling used to grow trees: swor
## Resample size used to grow trees: 129
## Analysis: RF-R
## Family: regr
## Splitting rule: mse *random*
## Number of random split points: 10
## % variance explained: 96.98
## Error rate: 1398.52
##
## [1] 2137.259
As can see from our above output, the tuned Random Forest model gives us an MSE of 1416.456.
3. Variable importance
model$importance # values vary a lot
## profit acq_exp acq_exp_sq ret_exp ret_exp_sq freq
## 1706.027378 30.986036 24.321875 17354.169429 16990.448953 515.569985
## freq_sq crossbuy sow industry revenue employees
## 523.696599 19.705390 -1.551735 4.812085 -22.895137 5.278684
data.frame(importance = model$importance) %>%
tibble::rownames_to_column(var = "variable") %>%
ggplot(aes(x = reorder(variable,importance), y = importance)) +
geom_bar(stat = "identity", fill = "orange", color = "black")+
coord_flip() +
labs(x = "Variables", y = "Variable importance")+
theme_nice
From the above plot, we notice that ret_exp and ret_exp_sq are the most important variables in our tuned Random Forest model.
4. Minimal depth
mindepth <- max.subtree(model,
sub.order = TRUE)
# first order depths
print(round(mindepth$order, 3)[,1])
## profit acq_exp acq_exp_sq ret_exp ret_exp_sq freq freq_sq
## 2.486 4.540 4.530 0.884 0.910 3.104 3.081
## crossbuy sow industry revenue employees
## 4.450 4.441 7.489 4.135 4.243
# visualize MD
data.frame(md = round(mindepth$order, 3)[,1]) %>%
tibble::rownames_to_column(var = "variable") %>%
ggplot(aes(x = reorder(variable,desc(md)), y = md)) +
geom_bar(stat = "identity", fill = "orange", color = "black", width = 0.2)+
coord_flip() +
labs(x = "Variables", y = "Minimal Depth")+
theme_nice
# interactions
mindepth$sub.order
## profit acq_exp acq_exp_sq ret_exp ret_exp_sq freq
## profit 0.2294943 0.6090446 0.6142477 0.40356082 0.40609612 0.5474962
## acq_exp 0.7999790 0.4188277 0.8467441 0.74723114 0.74912578 0.8312241
## acq_exp_sq 0.8050061 0.8429420 0.4177796 0.74233234 0.74873208 0.8317926
## ret_exp 0.3061582 0.4306060 0.4298876 0.08125676 0.16274447 0.2723902
## ret_exp_sq 0.3141555 0.4343987 0.4304986 0.16444788 0.08366547 0.2702216
## freq 0.5693441 0.6256223 0.6235810 0.36933062 0.36915118 0.2859555
## freq_sq 0.5614401 0.6219697 0.6308910 0.37443860 0.37470486 0.5875849
## crossbuy 0.7893786 0.8269184 0.8292220 0.72718858 0.72051212 0.7984258
## sow 0.8024670 0.8289974 0.8353687 0.73705723 0.73981044 0.8068809
## industry 0.9476473 0.9533208 0.9543947 0.93765593 0.94268528 0.9545705
## revenue 0.7650184 0.8173407 0.8140385 0.68529002 0.68608704 0.7817239
## employees 0.7817809 0.8165122 0.8196225 0.70354826 0.70183066 0.7904509
## freq_sq crossbuy sow industry revenue employees
## profit 0.5442071 0.6424088 0.6069171 0.8385685 0.5700204 0.5998224
## acq_exp 0.8295500 0.8529680 0.8109189 0.9405908 0.7992987 0.8207199
## acq_exp_sq 0.8282178 0.8536949 0.8156443 0.9411681 0.8073054 0.8168246
## ret_exp 0.2720461 0.4105165 0.4035786 0.7037773 0.3833836 0.3915000
## ret_exp_sq 0.2677335 0.4102806 0.4018677 0.7047763 0.3815175 0.3926795
## freq 0.5900456 0.5775900 0.5530355 0.8133875 0.5319380 0.5546622
## freq_sq 0.2837303 0.5822999 0.5477087 0.8084940 0.5347186 0.5597855
## crossbuy 0.7957865 0.4096734 0.7721201 0.9354499 0.7535056 0.7892283
## sow 0.8057391 0.8303523 0.4094313 0.9193636 0.7695340 0.7986284
## industry 0.9519238 0.9619299 0.9436034 0.6855060 0.9396089 0.9586391
## revenue 0.7799464 0.8077215 0.7460288 0.9187192 0.3812926 0.7642251
## employees 0.7940554 0.8203825 0.7702042 0.9240860 0.7630708 0.3913008
as.matrix(mindepth$sub.order) %>%
reshape2::melt() %>%
data.frame() %>%
ggplot(aes(x = Var1, y = Var2, fill = value)) +
scale_x_discrete(position = "top") +
geom_tile(color = "white") +
viridis::scale_fill_viridis("Relative min. depth") +
labs(x = "", y = "") +
theme_bw()
# cross-check with vimp
find.interaction(model,
method = "vimp",
importance = "permute")
## Pairing ret_exp with ret_exp_sq
## Pairing ret_exp with profit
## Pairing ret_exp with freq_sq
## Pairing ret_exp with freq
## Pairing ret_exp with acq_exp
## Pairing ret_exp with acq_exp_sq
## Pairing ret_exp with crossbuy
## Pairing ret_exp with employees
## Pairing ret_exp with industry
## Pairing ret_exp with sow
## Pairing ret_exp with revenue
## Pairing ret_exp_sq with profit
## Pairing ret_exp_sq with freq_sq
## Pairing ret_exp_sq with freq
## Pairing ret_exp_sq with acq_exp
## Pairing ret_exp_sq with acq_exp_sq
## Pairing ret_exp_sq with crossbuy
## Pairing ret_exp_sq with employees
## Pairing ret_exp_sq with industry
## Pairing ret_exp_sq with sow
## Pairing ret_exp_sq with revenue
## Pairing profit with freq_sq
## Pairing profit with freq
## Pairing profit with acq_exp
## Pairing profit with acq_exp_sq
## Pairing profit with crossbuy
## Pairing profit with employees
## Pairing profit with industry
## Pairing profit with sow
## Pairing profit with revenue
## Pairing freq_sq with freq
## Pairing freq_sq with acq_exp
## Pairing freq_sq with acq_exp_sq
## Pairing freq_sq with crossbuy
## Pairing freq_sq with employees
## Pairing freq_sq with industry
## Pairing freq_sq with sow
## Pairing freq_sq with revenue
## Pairing freq with acq_exp
## Pairing freq with acq_exp_sq
## Pairing freq with crossbuy
## Pairing freq with employees
## Pairing freq with industry
## Pairing freq with sow
## Pairing freq with revenue
## Pairing acq_exp with acq_exp_sq
## Pairing acq_exp with crossbuy
## Pairing acq_exp with employees
## Pairing acq_exp with industry
## Pairing acq_exp with sow
## Pairing acq_exp with revenue
## Pairing acq_exp_sq with crossbuy
## Pairing acq_exp_sq with employees
## Pairing acq_exp_sq with industry
## Pairing acq_exp_sq with sow
## Pairing acq_exp_sq with revenue
## Pairing crossbuy with employees
## Pairing crossbuy with industry
## Pairing crossbuy with sow
## Pairing crossbuy with revenue
## Pairing employees with industry
## Pairing employees with sow
## Pairing employees with revenue
## Pairing industry with sow
## Pairing industry with revenue
## Pairing sow with revenue
##
## Method: vimp
## No. of variables: 12
## Variables sorted by VIMP?: TRUE
## No. of variables used for pairing: 12
## Total no. of paired interactions: 66
## Monte Carlo replications: 1
## Type of noising up used for VIMP: permute
##
## Var 1 Var 2 Paired Additive Difference
## ret_exp:ret_exp_sq 17297.3667 17102.6511 46197.8028 34400.0179 11797.7850
## ret_exp:profit 17297.3667 1725.5185 22033.2388 19022.8852 3010.3536
## ret_exp:freq_sq 17297.3667 522.1415 17813.0112 17819.5082 -6.4970
## ret_exp:freq 17297.3667 508.8840 17766.8767 17806.2507 -39.3740
## ret_exp:acq_exp 17297.3667 28.0447 17306.2679 17325.4114 -19.1435
## ret_exp:acq_exp_sq 17297.3667 25.7614 17268.9486 17323.1281 -54.1795
## ret_exp:crossbuy 17297.3667 11.9844 17393.0263 17309.3511 83.6752
## ret_exp:employees 17297.3667 -2.4840 17348.3886 17294.8827 53.5059
## ret_exp:industry 17297.3667 1.8402 17267.9302 17299.2070 -31.2768
## ret_exp:sow 17297.3667 -0.5027 17359.8486 17296.8640 62.9846
## ret_exp:revenue 17297.3667 -15.2566 17183.1959 17282.1102 -98.9143
## ret_exp_sq:profit 16767.5372 1701.7117 21425.1063 18469.2489 2955.8574
## ret_exp_sq:freq_sq 16767.5372 506.6615 17310.2879 17274.1987 36.0892
## ret_exp_sq:freq 16767.5372 517.2377 17307.2498 17284.7749 22.4749
## ret_exp_sq:acq_exp 16767.5372 29.7246 16804.5629 16797.2618 7.3011
## ret_exp_sq:acq_exp_sq 16767.5372 30.2726 16820.8112 16797.8098 23.0014
## ret_exp_sq:crossbuy 16767.5372 14.4920 16744.3016 16782.0292 -37.7276
## ret_exp_sq:employees 16767.5372 1.6745 16831.6354 16769.2116 62.4238
## ret_exp_sq:industry 16767.5372 5.2584 16816.8680 16772.7956 44.0724
## ret_exp_sq:sow 16767.5372 -1.6928 16735.2654 16765.8444 -30.5790
## ret_exp_sq:revenue 16767.5372 -15.7950 16751.1336 16751.7421 -0.6086
## profit:freq_sq 1693.3172 522.6640 2267.9060 2215.9812 51.9248
## profit:freq 1693.3172 500.5254 2210.2603 2193.8427 16.4177
## profit:acq_exp 1693.3172 35.5257 1671.4144 1728.8430 -57.4286
## profit:acq_exp_sq 1693.3172 18.0137 1676.2998 1711.3310 -35.0312
## profit:crossbuy 1693.3172 17.2173 1726.7914 1710.5345 16.2569
## profit:employees 1693.3172 -1.1234 1673.0515 1692.1938 -19.1423
## profit:industry 1693.3172 3.6288 1723.6518 1696.9460 26.7058
## profit:sow 1693.3172 5.8488 1681.0803 1699.1660 -18.0857
## profit:revenue 1693.3172 -13.0462 1678.1313 1680.2710 -2.1396
## freq_sq:freq 514.4935 497.0577 1204.0116 1011.5512 192.4603
## freq_sq:acq_exp 514.4935 27.6496 549.1426 542.1431 6.9995
## freq_sq:acq_exp_sq 514.4935 26.8274 529.5638 541.3210 -11.7572
## freq_sq:crossbuy 514.4935 19.1495 527.4752 533.6430 -6.1678
## freq_sq:employees 514.4935 2.2613 518.7328 516.7548 1.9780
## freq_sq:industry 514.4935 4.7807 514.0019 519.2742 -5.2723
## freq_sq:sow 514.4935 0.7332 529.4787 515.2267 14.2520
## freq_sq:revenue 514.4935 -19.8019 505.9088 494.6916 11.2172
## freq:acq_exp 504.6062 30.9421 546.5592 535.5483 11.0108
## freq:acq_exp_sq 504.6062 30.9866 542.8100 535.5929 7.2171
## freq:crossbuy 504.6062 11.8196 515.0037 516.4258 -1.4221
## freq:employees 504.6062 1.9790 515.5613 506.5852 8.9761
## freq:industry 504.6062 5.1753 515.2461 509.7815 5.4645
## freq:sow 504.6062 1.4570 506.9094 506.0632 0.8462
## freq:revenue 504.6062 -16.0468 480.4935 488.5594 -8.0660
## acq_exp:acq_exp_sq 25.9245 20.5282 45.5938 46.4526 -0.8588
## acq_exp:crossbuy 25.9245 15.9460 42.2767 41.8705 0.4062
## acq_exp:employees 25.9245 -3.4465 21.1403 22.4779 -1.3377
## acq_exp:industry 25.9245 5.7197 23.9137 31.6442 -7.7304
## acq_exp:sow 25.9245 4.4385 26.0649 30.3629 -4.2980
## acq_exp:revenue 25.9245 -14.8110 19.2627 11.1134 8.1492
## acq_exp_sq:crossbuy 34.5877 19.8991 50.9246 54.4868 -3.5622
## acq_exp_sq:employees 34.5877 -6.1188 39.2383 28.4689 10.7694
## acq_exp_sq:industry 34.5877 4.4894 37.7517 39.0771 -1.3254
## acq_exp_sq:sow 34.5877 -2.1146 36.1039 32.4731 3.6308
## acq_exp_sq:revenue 34.5877 -13.3756 18.0898 21.2121 -3.1223
## crossbuy:employees 18.6579 -8.5443 15.5589 10.1136 5.4453
## crossbuy:industry 18.6579 6.2126 25.4707 24.8705 0.6002
## crossbuy:sow 18.6579 3.2085 23.6695 21.8664 1.8032
## crossbuy:revenue 18.6579 -15.2094 6.4384 3.4484 2.9900
## employees:industry -2.7533 3.3074 3.3960 0.5541 2.8419
## employees:sow -2.7533 1.5458 -3.7087 -1.2075 -2.5012
## employees:revenue -2.7533 -25.2063 -29.0859 -27.9596 -1.1262
## industry:sow 5.5231 -4.0597 10.0269 1.4633 8.5636
## industry:revenue 5.5231 -16.7013 -19.3432 -11.1782 -8.1649
## sow:revenue 5.4986 -13.5148 -17.5169 -8.0162 -9.5007
From the above output we see that the minimal depth analysis reaffirms our important variables: ret_exp and ret_exp_sq. From the find.interaction output we see that we may want to include the interactions between the following variables in our model: ret_exp:ret_exp_sq, ret_exp:profit, ret_exp:freq, ret_exp_sq:profit, ret_exp_sq:freq, and ret_exp_sq:revenue.
5. Partial Dependence Plots
model <- randomForest(duration ~ profit + acq_exp + acq_exp_sq + ret_exp + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2, mtry = 6, nodesize = 2, ntree = 5000, importance=T)
par(mfrow=c(4,3))
partialPlot(model, pred.data = train2, x.var = "profit")
partialPlot(model, pred.data = train2, x.var = "acq_exp")
partialPlot(model, pred.data = train2, x.var = "acq_exp_sq")
partialPlot(model, pred.data = train2, x.var = "ret_exp")
partialPlot(model, pred.data = train2, x.var = "ret_exp_sq")
partialPlot(model, pred.data = train2, x.var = "freq")
partialPlot(model, pred.data = train2, x.var = "freq_sq")
partialPlot(model, pred.data = train2, x.var = "crossbuy")
partialPlot(model, pred.data = train2, x.var = "sow")
partialPlot(model, pred.data = train2, x.var = "industry")
partialPlot(model, pred.data = train2, x.var = "revenue")
partialPlot(model, pred.data = train2, x.var = "employees")
set.seed(100)
model <- rfsrc(duration ~ profit + acq_exp + acq_exp_sq + ret_exp + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees + ret_exp*ret_exp_sq + ret_exp:profit + ret_exp:freq + ret_exp_sq:profit + ret_exp_sq:freq + ret_exp_sq:revenue, data = train2, mtry = 6, nodesize = 2, ntree = 5000, importance=T)
model
## Sample size: 204
## Number of trees: 5000
## Forest terminal node size: 2
## Average no. of terminal nodes: 63.4598
## No. of variables tried at each split: 6
## Total no. of variables: 12
## Resampling used to grow trees: swor
## Resample size used to grow trees: 129
## Analysis: RF-R
## Family: regr
## Splitting rule: mse *random*
## Number of random split points: 10
## % variance explained: 96.98
## Error rate: 1398.52
preds = predict(model,test2)
preds = preds$predicted
mse = mean(((preds-test2$duration)^2))
print(mse)
## [1] 2137.259
mse.list[5] = mse
We can see that we get no improvement upon including interaction terms.
Durationmethod2 = c("Linear Regression (no squared terms)", "Linear Regression (squared terms)", "Random Forest", "Tuned Random Forest", "Tuned Random Forest (w interaction terms)")
mse.df = cbind(Method = method2, MSE = mse.list)
kable(mse.df)
| Method | MSE |
|---|---|
| Linear Regression (no squared terms) | 2223.70103310312 |
| Linear Regression (squared terms) | 3499.42139613396 |
| Random Forest | 2221.88348983668 |
| Tuned Random Forest | 2137.25850520586 |
| Tuned Random Forest (w interaction terms) | 2137.25850520585 |
As we can see from the above table, the tuned Random Forest model is the best at predicting duration based on the MSE.