Part One: Predict acquisition

Train Test Split

set.seed(100)
data(acquisitionRetention)
data1 = acquisitionRetention
index = sample(nrow(data1),0.8*nrow(data1))
train = data1[index,]
test = data1[-index,]

Cleaning Data

anyNA(data1)

model <- glm(acquisition ~acq_exp  + industry + revenue + employees, data = train, family = binomial)

vif(model)
## [1] FALSE
##   acq_exp  industry   revenue employees 
##  1.010358  1.100550  1.054522  1.130627

Note that there are no missing values for this data set. After removing variables with perfect separation, we are left with the variables acq_exp, industry, revenue, and employees. As we can see from the VIF output, there are no multicollinearity issues with this model.

Logistic Regression on acquisition

set.seed(100)

model <- glm(acquisition ~ acq_exp + industry + revenue + employees, data = train, family = binomial)

#cooks distance plot

dev <-residuals(model, type = "deviance")
pea <-residuals(model, type = "pearson")
devstd<-residuals(model, type = "deviance")/sqrt(1 - hatvalues(model)) # standardized deviance residuals
peastd <-residuals(model, type = "pearson")/sqrt(1 - hatvalues(model)) # standardized pearson residuals

dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))

plot(devstd[model$model$acquisition==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "std. deviance residuals", xlab = "ID")
points(devstd[model$model$acquisition==0], col = "blue")

plot(peastd[model$model$acquisition==1], col = "red", 
     ylim = c(-3.5,3.5), ylab = "std. Pearson residuals", xlab = "ID")
points(peastd[model$model$acquisition==0], col = "blue")

#taking out outliers

cooks_sd_cutoff = 3*sd(cooks.distance(model))
out <- which(cooks.distance(model)>cooks_sd_cutoff)

model <- glm(as.factor(acquisition) ~ acq_exp  + industry + revenue + employees, data = train[-out,], family = binomial)

summary(model)

#Maximized cutoff (change predict object)
preds <- prediction(predict(model, test, type = "response"),
                   test$acquisition) 

#Predicted Probability and True Classification

auc <- round(as.numeric(performance(preds, measure = "auc")@y.values),3)

false.rates <-performance(preds, "fpr","fnr")
accuracy <-performance(preds, "acc","err")

plot(unlist(performance(preds, "sens")@x.values), unlist(performance(preds, "sens")@y.values), 
     type="l", lwd=2, 
     ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(preds, "spec")@x.values), unlist(performance(preds, "spec")@y.values), 
     type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')

min.diff <-which.min(abs(unlist(performance(preds, "sens")@y.values) - unlist(performance(preds, "spec")@y.values)))
min.x<-unlist(performance(preds, "sens")@x.values)[min.diff]
min.y<-unlist(performance(preds, "spec")@y.values)[min.diff]
optimal <-min.x

abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)

#ROC plot (don't need to change)
perf <- performance(preds, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))

#Confusion Matrix (change predict object)
predict = predict(model, test, type = "response")
#optcutoff = optimalCutoff(test$acquisition,predict)

predict_choice = ifelse(predict>optimal, 1,0)


caret = caret::confusionMatrix(as.factor(predict_choice),as.factor(test$acquisition), positive = '1')
caret

#diagnostics
cooks_distance = cooks.distance(model)
plot(cooks_distance,col="pink", pch=19, cex=1) 
text(cooks.distance(model),labels = rownames(train))

exp(coef(model))

LR = Anova(model, test = "LR")
Wald = Anova(model, test = "Wald")
score = anova(model, test = "Rao")
data.frame(LR = LR[,1], Score = score$Rao[2], Wald = Wald$Chisq)
data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[2], p.Wald = Wald$`Pr(>Chisq)`)

accuracy.list = c()
accuracy.list[1]=caret$overall[1]
## 
## Call:
## glm(formula = as.factor(acquisition) ~ acq_exp + industry + revenue + 
##     employees, family = binomial, data = train[-out, ])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4253  -0.3946   0.1900   0.5450   2.2486  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.8713978  1.1524949  -7.698 1.39e-14 ***
## acq_exp     -0.0004139  0.0008896  -0.465    0.642    
## industry     1.8230295  0.3346855   5.447 5.12e-08 ***
## revenue      0.0852951  0.0170349   5.007 5.53e-07 ***
## employees    0.0094531  0.0010891   8.680  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 478.16  on 389  degrees of freedom
## Residual deviance: 271.91  on 385  degrees of freedom
## AIC: 281.91
## 
## Number of Fisher Scoring iterations: 6
## 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 29 15
##          1  8 48
##                                           
##                Accuracy : 0.77            
##                  95% CI : (0.6751, 0.8483)
##     No Information Rate : 0.63            
##     P-Value [Acc > NIR] : 0.001979        
##                                           
##                   Kappa : 0.5252          
##                                           
##  Mcnemar's Test P-Value : 0.210903        
##                                           
##             Sensitivity : 0.7619          
##             Specificity : 0.7838          
##          Pos Pred Value : 0.8571          
##          Neg Pred Value : 0.6591          
##              Prevalence : 0.6300          
##          Detection Rate : 0.4800          
##    Detection Prevalence : 0.5600          
##       Balanced Accuracy : 0.7728          
##                                           
##        'Positive' Class : 1               
##                                           
##  (Intercept)      acq_exp     industry      revenue    employees 
## 0.0001403463 0.9995861963 6.1905846993 1.0890383556 1.0094979122 
##           LR      Score       Wald
## 1   0.216812 0.01611407  0.2164745
## 2  34.901514 0.01611407 29.6697266
## 3  28.945128 0.01611407 25.0708574
## 4 153.889774 0.01611407 75.3339113
##           p.LR   p.Score       p.Wald
## 1 6.414798e-01 0.8989869 6.417394e-01
## 2 3.468110e-09 0.8989869 5.122864e-08
## 3 7.445784e-08 0.8989869 5.526175e-07
## 4 2.448035e-35 0.8989869 3.974740e-18

Using a cutoff value of 0.7698277 we get an accuracy of 77% with logistic regression.

Decision Tree on acquisition

set.seed(100)

# simple DT model
model <- rpart(as.factor(acquisition) ~ acq_exp + industry + revenue + employees, data = train) 

# visualize the DT
rattle::fancyRpartPlot(model, sub = "") 

# save predictions from DT

preds = predict(model,test,type="class")

caret = caret::confusionMatrix(as.factor(preds),as.factor(test$acquisition), positive = '1')
caret

accuracy.list[2]=caret$overall[1]
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 19 12
##          1 18 51
##                                           
##                Accuracy : 0.7             
##                  95% CI : (0.6002, 0.7876)
##     No Information Rate : 0.63            
##     P-Value [Acc > NIR] : 0.08771         
##                                           
##                   Kappa : 0.3342          
##                                           
##  Mcnemar's Test P-Value : 0.36131         
##                                           
##             Sensitivity : 0.8095          
##             Specificity : 0.5135          
##          Pos Pred Value : 0.7391          
##          Neg Pred Value : 0.6129          
##              Prevalence : 0.6300          
##          Detection Rate : 0.5100          
##    Detection Prevalence : 0.6900          
##       Balanced Accuracy : 0.6615          
##                                           
##        'Positive' Class : 1               
## 

We get an accuracy of 70% with decision tree method.

Random Forest on acquisition

set.seed(100)
model = randomForest(as.factor(acquisition)~ acq_exp + industry + revenue + employees, data = train, mtry=4, ntree=1000, importance=T)


#Then we calculate the test error of bagging
preds = predict(model, newdata = test)

caret = caret::confusionMatrix(as.factor(preds),as.factor(test$acquisition), positive = '1')
caret

accuracy.list[3]=caret$overall[1]
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 20 11
##          1 17 52
##                                           
##                Accuracy : 0.72            
##                  95% CI : (0.6213, 0.8052)
##     No Information Rate : 0.63            
##     P-Value [Acc > NIR] : 0.03722         
##                                           
##                   Kappa : 0.3786          
##                                           
##  Mcnemar's Test P-Value : 0.34470         
##                                           
##             Sensitivity : 0.8254          
##             Specificity : 0.5405          
##          Pos Pred Value : 0.7536          
##          Neg Pred Value : 0.6452          
##              Prevalence : 0.6300          
##          Detection Rate : 0.5200          
##    Detection Prevalence : 0.6900          
##       Balanced Accuracy : 0.6830          
##                                           
##        'Positive' Class : 1               
## 

We get an accuracy of 72% with the random forest method.

Tuned Random Forest on acquisition

1. Tuning Hyperparameters

set.seed(100)

mtry.value = seq(1,4,1)
ntree.values = seq(1e3, 10e3, 1e3)
nodesize.values = seq(2,16,2)

# Create a data frame containing all combinations 

hypergrid = expand.grid(mtry = mtry.value, nodesize = nodesize.values, ntree = ntree.values)


# Create an empty vector to store OOB error values

oob_error = c()


# Write a loop over the rows of hypergrid to train the grid of models
for (i in 1:nrow(hypergrid)) {
  
# Train a Random Forest model with different hyper parameters
  model = randomForest(as.factor(acquisition)~ acq_exp + industry + revenue + employees, data = train, importance = T, mtry = hypergrid$mtry[i], nodesize = hypergrid$nodesize[i], ntree = hypergrid$ntree[i])
  
  oob_error[i]= model$err.rate[length(model$err.rate)]
  
  
}

opt_i <- which.min(oob_error)
print(hypergrid[opt_i,])

Based on the tuning output, the optimal hyperparameters are: mtry = 1, nodesize = 2, and ntree = 1000.

2. Tuned Random Forest Output

set.seed(7)
model = randomForest(as.factor(acquisition)~ acq_exp + industry + revenue + employees, data = train, mtry=1, nodesize=2, ntree=1000, importance=T)

#Then we calculate the test error of bagging
preds = predict(model, newdata = test)

caret = caret::confusionMatrix(as.factor(preds),as.factor(test$acquisition), positive = '1')
caret$overall[1]

accuracy.list[4]=caret$overall[1]
## Accuracy 
##     0.78

The tuned Random Forest model gives us an accuracy of 78%.

3. Variable importance

importance(model)
varImpPlot(model)

##                  0        1 MeanDecreaseAccuracy MeanDecreaseGini
## acq_exp   21.21482 19.89358             26.04854        26.761552
## industry  15.31765 15.00624             19.20478         9.036317
## revenue   15.47720 17.12373             21.22813        23.686965
## employees 46.02082 41.63922             49.32958        46.517864

Based on the above plot we can see that employees is our most important variable followed by axq_exp.

4. Checking for interaction terms

set.seed(100)
rfsrc.train = train
rfsrc.train$acquisition = as.factor(rfsrc.train$acquisition)
rfsrc.model <- rfsrc(as.factor(acquisition)~ acq_exp + industry + revenue + employees, data = rfsrc.train, mtry=1, nodesize=2, ntree=1000, importance=TRUE)

find.interaction(rfsrc.model,
                      method = "vimp",
                      importance = "permute")
## Pairing employees with acq_exp 
## Pairing employees with revenue 
## Pairing employees with industry 
## Pairing acq_exp with revenue 
## Pairing acq_exp with industry 
## Pairing revenue with industry 
## 
##                               Method: vimp
##                     No. of variables: 4
##            Variables sorted by VIMP?: TRUE
##    No. of variables used for pairing: 4
##     Total no. of paired interactions: 6
##             Monte Carlo replications: 1
##     Type of noising up used for VIMP: permute
## 
##                     Var 1  Var 2 Paired Additive Difference
## employees:acq_exp  0.0777 0.0217 0.1047   0.0994     0.0053
## employees:revenue  0.0777 0.0196 0.0935   0.0974    -0.0039
## employees:industry 0.0777 0.0097 0.0904   0.0874     0.0030
## acq_exp:revenue    0.0227 0.0201 0.0436   0.0428     0.0008
## acq_exp:industry   0.0227 0.0113 0.0345   0.0340     0.0004
## revenue:industry   0.0177 0.0128 0.0389   0.0306     0.0084

As we can see from the above table, there are no interaction terms to consider when training a random forest on acquisition.

5. Partial Dependence Plots on acquisition

par(mfrow=c(2,2))
partialPlot(model, pred.data = train, x.var = "acq_exp") 
partialPlot(model, pred.data = train, x.var = "industry") 
partialPlot(model, pred.data = train, x.var = "revenue") 
partialPlot(model, pred.data = train, x.var = "employees") 

Model Comparisons for acquisitions

method = c("Logistic Regression", "Decision Tree", "Random Forest", "Tuned Random Forest")
accuracy.df = cbind(Method = method, Accuracy = accuracy.list)
kable(accuracy.df)
Method Accuracy
Logistic Regression 0.77
Decision Tree 0.7
Random Forest 0.72
Tuned Random Forest 0.78

As we can see from the table, the Tuned Random Forest method offers us the greatest accuracy, but they are all similar.

Part Two: Predict duration

Create a subset of the data including only correct acquisition predictions from our tuned RF model and create a test/train split. We will use this subsetted data to predict duration.

data2 = data.frame(acquisitionRetention,prediction=preds)
## Warning in data.frame(acquisitionRetention, prediction = preds): row names were
## found from a short variable and have been discarded
data2=data2[data2$prediction==1 & data2$acquisition==1, ]
index2 = sample(nrow(data2),0.8*nrow(data2))
train2 = data2[index2,]
test2 = data2[-index2,]

Linear Regression on Duration (no squared terms)

set.seed(100)
model <- lm(duration ~profit+acq_exp+ret_exp+freq+crossbuy+sow+industry+revenue+employees, data = train2)

vif(model)

model <- lm(duration ~ acq_exp+ret_exp+freq+crossbuy+sow+industry+revenue+employees, data = train2)

vif(model)

preds = predict(model,test2)

mse = mean(((preds-test2$duration)^2))

summary(model)

print(mse)

mse.list = c()
mse.list[1] = mse
##    profit   acq_exp   ret_exp      freq  crossbuy       sow  industry   revenue 
## 38.701384  9.608442 28.458096  1.124796  1.115933  1.099287  1.133198  1.135979 
## employees 
##  1.368262 
##   acq_exp   ret_exp      freq  crossbuy       sow  industry   revenue employees 
##  1.031667  1.020111  1.020356  1.031486  1.015313  1.017015  1.050198  1.024683 
## 
## Call:
## lm(formula = duration ~ acq_exp + ret_exp + freq + crossbuy + 
##     sow + industry + revenue + employees, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -211.433  -18.688    8.916   33.887   81.384 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 524.723537  30.986843  16.934   <2e-16 ***
## acq_exp       0.006183   0.023496   0.263   0.7927    
## ret_exp       1.330547   0.021604  61.587   <2e-16 ***
## freq         -7.646533   0.818067  -9.347   <2e-16 ***
## crossbuy      1.930653   1.674683   1.153   0.2504    
## sow           0.304886   0.174531   1.747   0.0822 .  
## industry    -15.867651   7.011198  -2.263   0.0247 *  
## revenue      -0.347417   0.380044  -0.914   0.3618    
## employees    -0.031493   0.014661  -2.148   0.0329 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.3 on 195 degrees of freedom
## Multiple R-squared:  0.9536, Adjusted R-squared:  0.9517 
## F-statistic: 500.6 on 8 and 195 DF,  p-value: < 2.2e-16
## 
## [1] 2223.701

Removed profit due to multicollinearity (VIF>10). Our MSE value is 2270.322

Linear Regression on Duration (w/ squared terms)

set.seed(100)
model <- lm(duration ~ profit + acq_exp + ret_exp + acq_exp_sq + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2)

vif(model)

model <- lm(duration ~ profit + acq_exp + acq_exp_sq + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2)

vif(model)

model <- lm(duration ~ profit + acq_exp_sq + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2)

vif(model)

model <- lm(duration ~ profit + acq_exp_sq + ret_exp_sq + freq + crossbuy + sow + industry + revenue + employees, data = train2)

vif(model)

preds = predict(model,test2)

mse = mean(((preds-test2$duration)^2))

summary(model)

print(mse)

mse.list[2] = mse
##     profit    acq_exp    ret_exp acq_exp_sq ret_exp_sq       freq    freq_sq 
## 107.241209 126.302949 172.069378  54.803282  37.194099  15.066793  17.512475 
##   crossbuy        sow   industry    revenue  employees 
##   1.463018   1.387154   1.485038   1.250835   2.074465 
##     profit    acq_exp acq_exp_sq ret_exp_sq       freq    freq_sq   crossbuy 
##  15.271309  47.352056  35.801831  10.829825  14.063664  14.402201   1.131152 
##        sow   industry    revenue  employees 
##   1.078643   1.062379   1.082882   1.156823 
##     profit acq_exp_sq ret_exp_sq       freq    freq_sq   crossbuy        sow 
##   9.638643   2.810898   7.430200  13.987311  14.212484   1.096648   1.030584 
##   industry    revenue  employees 
##   1.029216   1.078639   1.088507 
##     profit acq_exp_sq ret_exp_sq       freq   crossbuy        sow   industry 
##   9.347543   2.755221   7.221587   1.025570   1.055651   1.030567   1.026849 
##    revenue  employees 
##   1.076697   1.087497 
## 
## Call:
## lm(formula = duration ~ profit + acq_exp_sq + ret_exp_sq + freq + 
##     crossbuy + sow + industry + revenue + employees, data = train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -136.298  -34.797   -2.168   31.760  207.817 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.464e+02  4.987e+01   6.946 5.53e-11 ***
## profit       2.880e-01  1.852e-02  15.550  < 2e-16 ***
## acq_exp_sq  -6.002e-04  4.738e-05 -12.668  < 2e-16 ***
## ret_exp_sq   2.039e-04  5.794e-05   3.518 0.000541 ***
## freq        -5.229e+00  9.945e-01  -5.258 3.83e-07 ***
## crossbuy    -2.184e+00  2.054e+00  -1.063 0.289059    
## sow         -9.613e-02  2.132e-01  -0.451 0.652612    
## industry    -3.705e+01  8.543e+00  -4.336 2.33e-05 ***
## revenue     -1.515e+00  4.666e-01  -3.246 0.001377 ** 
## employees   -1.086e-01  1.832e-02  -5.927 1.39e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.36 on 194 degrees of freedom
## Multiple R-squared:  0.9321, Adjusted R-squared:  0.9289 
## F-statistic: 295.8 on 9 and 194 DF,  p-value: < 2.2e-16
## 
## [1] 3499.421

Note that we removed ret_exp, acq_exp, and freq_sq due to multicollinearity (VIF>10). Our MSE value is 2945.102.

Random Forest on Duration (w squared terms)

set.seed(100)

model <- rfsrc(duration ~ profit + acq_exp + acq_exp_sq + ret_exp + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2, ntree = 1000)

model

preds = predict(model,test2)

preds = preds$predicted

mse = mean(((preds-test2$duration)^2))

print(mse)

mse.list[3] = mse
##                          Sample size: 204
##                      Number of trees: 1000
##            Forest terminal node size: 5
##        Average no. of terminal nodes: 24.872
## No. of variables tried at each split: 4
##               Total no. of variables: 12
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 129
##                             Analysis: RF-R
##                               Family: regr
##                       Splitting rule: mse *random*
##        Number of random split points: 10
##                 % variance explained: 95.87
##                           Error rate: 1911.59
## 
## [1] 2221.883

Tuned Random Forest on duration

1. Tuning Hyperparameters for Random Forest on duration

# Establish a list of possible values for hyper-parameters
set.seed(100)
mtry.values <- seq(1,12,1)
nodesize.values <- seq(2,16,2)
ntree.values <- seq(1e3,10e3,1e3)

# Create a data frame containing all combinations 
hyper_grid <- expand.grid(mtry = mtry.values, nodesize = nodesize.values, ntree = ntree.values)

# Create an empty vector to store OOB error values
oob_err <- c()

# Write a loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {

    # Train a Random Forest model
   model <- rfsrc(duration ~ profit + acq_exp + acq_exp_sq + ret_exp + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2,mtry = hyper_grid$mtry[i], nodesize = hyper_grid$nodesize[i], ntree = hyper_grid$ntree[i])  
  
                          
    # Store OOB error for the model                      
    oob_err[i] <- model$err.rate[length(model$err.rate)]
}

# Identify optimal set of hyperparmeters based on OOB error
opt_i <- which.min(oob_err)
print(hyper_grid[opt_i,])

Based on the tuning output, the optimal hyperparameters are: mtry = 6, nodesize = 2, and ntree = 5000.

2. Tuned Random Forest Output

set.seed(100)

model <- rfsrc(duration ~ profit + acq_exp + acq_exp_sq + ret_exp + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2, mtry = 6, nodesize = 2, ntree = 5000, importance=T)

model

preds = predict(model,test2)

preds = preds$predicted

mse = mean(((preds-test2$duration)^2))

print(mse)

mse.list[4] = mse
##                          Sample size: 204
##                      Number of trees: 5000
##            Forest terminal node size: 2
##        Average no. of terminal nodes: 63.4598
## No. of variables tried at each split: 6
##               Total no. of variables: 12
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 129
##                             Analysis: RF-R
##                               Family: regr
##                       Splitting rule: mse *random*
##        Number of random split points: 10
##                 % variance explained: 96.98
##                           Error rate: 1398.52
## 
## [1] 2137.259

As can see from our above output, the tuned Random Forest model gives us an MSE of 1416.456.

3. Variable importance

model$importance # values vary a lot
##       profit      acq_exp   acq_exp_sq      ret_exp   ret_exp_sq         freq 
##  1706.027378    30.986036    24.321875 17354.169429 16990.448953   515.569985 
##      freq_sq     crossbuy          sow     industry      revenue    employees 
##   523.696599    19.705390    -1.551735     4.812085   -22.895137     5.278684
data.frame(importance = model$importance) %>%
  tibble::rownames_to_column(var = "variable") %>%
  ggplot(aes(x = reorder(variable,importance), y = importance)) +
    geom_bar(stat = "identity", fill = "orange", color = "black")+
    coord_flip() +
     labs(x = "Variables", y = "Variable importance")+
     theme_nice   

From the above plot, we notice that ret_exp and ret_exp_sq are the most important variables in our tuned Random Forest model.

4. Minimal depth

mindepth <- max.subtree(model,
                        sub.order = TRUE)

# first order depths
print(round(mindepth$order, 3)[,1])
##     profit    acq_exp acq_exp_sq    ret_exp ret_exp_sq       freq    freq_sq 
##      2.486      4.540      4.530      0.884      0.910      3.104      3.081 
##   crossbuy        sow   industry    revenue  employees 
##      4.450      4.441      7.489      4.135      4.243
# visualize MD
data.frame(md = round(mindepth$order, 3)[,1]) %>%
  tibble::rownames_to_column(var = "variable") %>%
  ggplot(aes(x = reorder(variable,desc(md)), y = md)) +
    geom_bar(stat = "identity", fill = "orange", color = "black", width = 0.2)+
    coord_flip() +
     labs(x = "Variables", y = "Minimal Depth")+
     theme_nice

# interactions
mindepth$sub.order
##               profit   acq_exp acq_exp_sq    ret_exp ret_exp_sq      freq
## profit     0.2294943 0.6090446  0.6142477 0.40356082 0.40609612 0.5474962
## acq_exp    0.7999790 0.4188277  0.8467441 0.74723114 0.74912578 0.8312241
## acq_exp_sq 0.8050061 0.8429420  0.4177796 0.74233234 0.74873208 0.8317926
## ret_exp    0.3061582 0.4306060  0.4298876 0.08125676 0.16274447 0.2723902
## ret_exp_sq 0.3141555 0.4343987  0.4304986 0.16444788 0.08366547 0.2702216
## freq       0.5693441 0.6256223  0.6235810 0.36933062 0.36915118 0.2859555
## freq_sq    0.5614401 0.6219697  0.6308910 0.37443860 0.37470486 0.5875849
## crossbuy   0.7893786 0.8269184  0.8292220 0.72718858 0.72051212 0.7984258
## sow        0.8024670 0.8289974  0.8353687 0.73705723 0.73981044 0.8068809
## industry   0.9476473 0.9533208  0.9543947 0.93765593 0.94268528 0.9545705
## revenue    0.7650184 0.8173407  0.8140385 0.68529002 0.68608704 0.7817239
## employees  0.7817809 0.8165122  0.8196225 0.70354826 0.70183066 0.7904509
##              freq_sq  crossbuy       sow  industry   revenue employees
## profit     0.5442071 0.6424088 0.6069171 0.8385685 0.5700204 0.5998224
## acq_exp    0.8295500 0.8529680 0.8109189 0.9405908 0.7992987 0.8207199
## acq_exp_sq 0.8282178 0.8536949 0.8156443 0.9411681 0.8073054 0.8168246
## ret_exp    0.2720461 0.4105165 0.4035786 0.7037773 0.3833836 0.3915000
## ret_exp_sq 0.2677335 0.4102806 0.4018677 0.7047763 0.3815175 0.3926795
## freq       0.5900456 0.5775900 0.5530355 0.8133875 0.5319380 0.5546622
## freq_sq    0.2837303 0.5822999 0.5477087 0.8084940 0.5347186 0.5597855
## crossbuy   0.7957865 0.4096734 0.7721201 0.9354499 0.7535056 0.7892283
## sow        0.8057391 0.8303523 0.4094313 0.9193636 0.7695340 0.7986284
## industry   0.9519238 0.9619299 0.9436034 0.6855060 0.9396089 0.9586391
## revenue    0.7799464 0.8077215 0.7460288 0.9187192 0.3812926 0.7642251
## employees  0.7940554 0.8203825 0.7702042 0.9240860 0.7630708 0.3913008
as.matrix(mindepth$sub.order) %>%
  reshape2::melt() %>%
  data.frame() %>%
  ggplot(aes(x = Var1, y = Var2, fill = value)) +
    scale_x_discrete(position = "top") +
    geom_tile(color = "white") +
    viridis::scale_fill_viridis("Relative min. depth") +
    labs(x = "", y = "") +
    theme_bw()

# cross-check with vimp
find.interaction(model,
                      method = "vimp",
                      importance = "permute")
## Pairing ret_exp with ret_exp_sq 
## Pairing ret_exp with profit 
## Pairing ret_exp with freq_sq 
## Pairing ret_exp with freq 
## Pairing ret_exp with acq_exp 
## Pairing ret_exp with acq_exp_sq 
## Pairing ret_exp with crossbuy 
## Pairing ret_exp with employees 
## Pairing ret_exp with industry 
## Pairing ret_exp with sow 
## Pairing ret_exp with revenue 
## Pairing ret_exp_sq with profit 
## Pairing ret_exp_sq with freq_sq 
## Pairing ret_exp_sq with freq 
## Pairing ret_exp_sq with acq_exp 
## Pairing ret_exp_sq with acq_exp_sq 
## Pairing ret_exp_sq with crossbuy 
## Pairing ret_exp_sq with employees 
## Pairing ret_exp_sq with industry 
## Pairing ret_exp_sq with sow 
## Pairing ret_exp_sq with revenue 
## Pairing profit with freq_sq 
## Pairing profit with freq 
## Pairing profit with acq_exp 
## Pairing profit with acq_exp_sq 
## Pairing profit with crossbuy 
## Pairing profit with employees 
## Pairing profit with industry 
## Pairing profit with sow 
## Pairing profit with revenue 
## Pairing freq_sq with freq 
## Pairing freq_sq with acq_exp 
## Pairing freq_sq with acq_exp_sq 
## Pairing freq_sq with crossbuy 
## Pairing freq_sq with employees 
## Pairing freq_sq with industry 
## Pairing freq_sq with sow 
## Pairing freq_sq with revenue 
## Pairing freq with acq_exp 
## Pairing freq with acq_exp_sq 
## Pairing freq with crossbuy 
## Pairing freq with employees 
## Pairing freq with industry 
## Pairing freq with sow 
## Pairing freq with revenue 
## Pairing acq_exp with acq_exp_sq 
## Pairing acq_exp with crossbuy 
## Pairing acq_exp with employees 
## Pairing acq_exp with industry 
## Pairing acq_exp with sow 
## Pairing acq_exp with revenue 
## Pairing acq_exp_sq with crossbuy 
## Pairing acq_exp_sq with employees 
## Pairing acq_exp_sq with industry 
## Pairing acq_exp_sq with sow 
## Pairing acq_exp_sq with revenue 
## Pairing crossbuy with employees 
## Pairing crossbuy with industry 
## Pairing crossbuy with sow 
## Pairing crossbuy with revenue 
## Pairing employees with industry 
## Pairing employees with sow 
## Pairing employees with revenue 
## Pairing industry with sow 
## Pairing industry with revenue 
## Pairing sow with revenue 
## 
##                               Method: vimp
##                     No. of variables: 12
##            Variables sorted by VIMP?: TRUE
##    No. of variables used for pairing: 12
##     Total no. of paired interactions: 66
##             Monte Carlo replications: 1
##     Type of noising up used for VIMP: permute
## 
##                            Var 1      Var 2     Paired   Additive Difference
## ret_exp:ret_exp_sq    17297.3667 17102.6511 46197.8028 34400.0179 11797.7850
## ret_exp:profit        17297.3667  1725.5185 22033.2388 19022.8852  3010.3536
## ret_exp:freq_sq       17297.3667   522.1415 17813.0112 17819.5082    -6.4970
## ret_exp:freq          17297.3667   508.8840 17766.8767 17806.2507   -39.3740
## ret_exp:acq_exp       17297.3667    28.0447 17306.2679 17325.4114   -19.1435
## ret_exp:acq_exp_sq    17297.3667    25.7614 17268.9486 17323.1281   -54.1795
## ret_exp:crossbuy      17297.3667    11.9844 17393.0263 17309.3511    83.6752
## ret_exp:employees     17297.3667    -2.4840 17348.3886 17294.8827    53.5059
## ret_exp:industry      17297.3667     1.8402 17267.9302 17299.2070   -31.2768
## ret_exp:sow           17297.3667    -0.5027 17359.8486 17296.8640    62.9846
## ret_exp:revenue       17297.3667   -15.2566 17183.1959 17282.1102   -98.9143
## ret_exp_sq:profit     16767.5372  1701.7117 21425.1063 18469.2489  2955.8574
## ret_exp_sq:freq_sq    16767.5372   506.6615 17310.2879 17274.1987    36.0892
## ret_exp_sq:freq       16767.5372   517.2377 17307.2498 17284.7749    22.4749
## ret_exp_sq:acq_exp    16767.5372    29.7246 16804.5629 16797.2618     7.3011
## ret_exp_sq:acq_exp_sq 16767.5372    30.2726 16820.8112 16797.8098    23.0014
## ret_exp_sq:crossbuy   16767.5372    14.4920 16744.3016 16782.0292   -37.7276
## ret_exp_sq:employees  16767.5372     1.6745 16831.6354 16769.2116    62.4238
## ret_exp_sq:industry   16767.5372     5.2584 16816.8680 16772.7956    44.0724
## ret_exp_sq:sow        16767.5372    -1.6928 16735.2654 16765.8444   -30.5790
## ret_exp_sq:revenue    16767.5372   -15.7950 16751.1336 16751.7421    -0.6086
## profit:freq_sq         1693.3172   522.6640  2267.9060  2215.9812    51.9248
## profit:freq            1693.3172   500.5254  2210.2603  2193.8427    16.4177
## profit:acq_exp         1693.3172    35.5257  1671.4144  1728.8430   -57.4286
## profit:acq_exp_sq      1693.3172    18.0137  1676.2998  1711.3310   -35.0312
## profit:crossbuy        1693.3172    17.2173  1726.7914  1710.5345    16.2569
## profit:employees       1693.3172    -1.1234  1673.0515  1692.1938   -19.1423
## profit:industry        1693.3172     3.6288  1723.6518  1696.9460    26.7058
## profit:sow             1693.3172     5.8488  1681.0803  1699.1660   -18.0857
## profit:revenue         1693.3172   -13.0462  1678.1313  1680.2710    -2.1396
## freq_sq:freq            514.4935   497.0577  1204.0116  1011.5512   192.4603
## freq_sq:acq_exp         514.4935    27.6496   549.1426   542.1431     6.9995
## freq_sq:acq_exp_sq      514.4935    26.8274   529.5638   541.3210   -11.7572
## freq_sq:crossbuy        514.4935    19.1495   527.4752   533.6430    -6.1678
## freq_sq:employees       514.4935     2.2613   518.7328   516.7548     1.9780
## freq_sq:industry        514.4935     4.7807   514.0019   519.2742    -5.2723
## freq_sq:sow             514.4935     0.7332   529.4787   515.2267    14.2520
## freq_sq:revenue         514.4935   -19.8019   505.9088   494.6916    11.2172
## freq:acq_exp            504.6062    30.9421   546.5592   535.5483    11.0108
## freq:acq_exp_sq         504.6062    30.9866   542.8100   535.5929     7.2171
## freq:crossbuy           504.6062    11.8196   515.0037   516.4258    -1.4221
## freq:employees          504.6062     1.9790   515.5613   506.5852     8.9761
## freq:industry           504.6062     5.1753   515.2461   509.7815     5.4645
## freq:sow                504.6062     1.4570   506.9094   506.0632     0.8462
## freq:revenue            504.6062   -16.0468   480.4935   488.5594    -8.0660
## acq_exp:acq_exp_sq       25.9245    20.5282    45.5938    46.4526    -0.8588
## acq_exp:crossbuy         25.9245    15.9460    42.2767    41.8705     0.4062
## acq_exp:employees        25.9245    -3.4465    21.1403    22.4779    -1.3377
## acq_exp:industry         25.9245     5.7197    23.9137    31.6442    -7.7304
## acq_exp:sow              25.9245     4.4385    26.0649    30.3629    -4.2980
## acq_exp:revenue          25.9245   -14.8110    19.2627    11.1134     8.1492
## acq_exp_sq:crossbuy      34.5877    19.8991    50.9246    54.4868    -3.5622
## acq_exp_sq:employees     34.5877    -6.1188    39.2383    28.4689    10.7694
## acq_exp_sq:industry      34.5877     4.4894    37.7517    39.0771    -1.3254
## acq_exp_sq:sow           34.5877    -2.1146    36.1039    32.4731     3.6308
## acq_exp_sq:revenue       34.5877   -13.3756    18.0898    21.2121    -3.1223
## crossbuy:employees       18.6579    -8.5443    15.5589    10.1136     5.4453
## crossbuy:industry        18.6579     6.2126    25.4707    24.8705     0.6002
## crossbuy:sow             18.6579     3.2085    23.6695    21.8664     1.8032
## crossbuy:revenue         18.6579   -15.2094     6.4384     3.4484     2.9900
## employees:industry       -2.7533     3.3074     3.3960     0.5541     2.8419
## employees:sow            -2.7533     1.5458    -3.7087    -1.2075    -2.5012
## employees:revenue        -2.7533   -25.2063   -29.0859   -27.9596    -1.1262
## industry:sow              5.5231    -4.0597    10.0269     1.4633     8.5636
## industry:revenue          5.5231   -16.7013   -19.3432   -11.1782    -8.1649
## sow:revenue               5.4986   -13.5148   -17.5169    -8.0162    -9.5007

From the above output we see that the minimal depth analysis reaffirms our important variables: ret_exp and ret_exp_sq. From the find.interaction output we see that we may want to include the interactions between the following variables in our model: ret_exp:ret_exp_sq, ret_exp:profit, ret_exp:freq, ret_exp_sq:profit, ret_exp_sq:freq, and ret_exp_sq:revenue.

5. Partial Dependence Plots

model <- randomForest(duration ~ profit + acq_exp + acq_exp_sq + ret_exp + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees, data = train2, mtry = 6, nodesize = 2, ntree = 5000, importance=T)

par(mfrow=c(4,3))
partialPlot(model, pred.data = train2, x.var = "profit") 
partialPlot(model, pred.data = train2, x.var = "acq_exp") 
partialPlot(model, pred.data = train2, x.var = "acq_exp_sq") 
partialPlot(model, pred.data = train2, x.var = "ret_exp") 
partialPlot(model, pred.data = train2, x.var = "ret_exp_sq") 
partialPlot(model, pred.data = train2, x.var = "freq") 
partialPlot(model, pred.data = train2, x.var = "freq_sq") 
partialPlot(model, pred.data = train2, x.var = "crossbuy") 
partialPlot(model, pred.data = train2, x.var = "sow") 
partialPlot(model, pred.data = train2, x.var = "industry") 
partialPlot(model, pred.data = train2, x.var = "revenue") 
partialPlot(model, pred.data = train2, x.var = "employees") 

Tuned Random Forest with interaction terms

set.seed(100)

model <- rfsrc(duration ~ profit + acq_exp + acq_exp_sq + ret_exp + ret_exp_sq + freq + freq_sq + crossbuy + sow + industry + revenue + employees + ret_exp*ret_exp_sq + ret_exp:profit + ret_exp:freq + ret_exp_sq:profit + ret_exp_sq:freq + ret_exp_sq:revenue, data = train2, mtry = 6, nodesize = 2, ntree = 5000, importance=T)

model
##                          Sample size: 204
##                      Number of trees: 5000
##            Forest terminal node size: 2
##        Average no. of terminal nodes: 63.4598
## No. of variables tried at each split: 6
##               Total no. of variables: 12
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 129
##                             Analysis: RF-R
##                               Family: regr
##                       Splitting rule: mse *random*
##        Number of random split points: 10
##                 % variance explained: 96.98
##                           Error rate: 1398.52
preds = predict(model,test2)

preds = preds$predicted

mse = mean(((preds-test2$duration)^2))

print(mse)
## [1] 2137.259
mse.list[5] = mse

We can see that we get no improvement upon including interaction terms.

Model Comparisons for Duration

method2 = c("Linear Regression (no squared terms)", "Linear Regression (squared terms)", "Random Forest", "Tuned Random Forest", "Tuned Random Forest (w interaction terms)")
mse.df = cbind(Method = method2, MSE = mse.list)
kable(mse.df)
Method MSE
Linear Regression (no squared terms) 2223.70103310312
Linear Regression (squared terms) 3499.42139613396
Random Forest 2221.88348983668
Tuned Random Forest 2137.25850520586
Tuned Random Forest (w interaction terms) 2137.25850520585

As we can see from the above table, the tuned Random Forest model is the best at predicting duration based on the MSE.