data("acquisitionRetention")
acquisitionRetention$acquisition=as.factor(acquisitionRetention$acquisition)
#anyNA(acquisitionRetention)
# 80/20 train-test split (unseeded, so the split varies between runs)
index = sample(nrow(acquisitionRetention), 0.8*nrow(acquisitionRetention))
train = acquisitionRetention[index,]
test = acquisitionRetention[-index,]
model <- glm(acquisition ~
industry +
acq_exp +
revenue +
employees, data = train, family = binomial)
# residual diagnostics: standardized deviance and Pearson residuals
dev <- residuals(model, type = "deviance")
pea <- residuals(model, type = "pearson")
devstd <- dev/sqrt(1 - hatvalues(model)) # standardized deviance residuals
peastd <- pea/sqrt(1 - hatvalues(model)) # standardized Pearson residuals
dev.new(width = 1000, height = 1000, unit = "px")
par(mfrow=c(1,2))
plot(devstd[model$model$acquisition==1], col = "red",
ylim = c(-3.5,3.5), ylab = "std. deviance residuals", xlab = "ID")
points(devstd[model$model$acquisition==0], col = "blue")
plot(peastd[model$model$acquisition==1], col = "red",
ylim = c(-3.5,3.5), ylab = "std. Pearson residuals", xlab = "ID")
points(peastd[model$model$acquisition==0], col = "blue")
#model selection
# drop influential observations (Cook's distance above 3x its SD)
cooks_sd_cutoff = 3*sd(cooks.distance(model))
out <- which(cooks.distance(model) > cooks_sd_cutoff)
model <- glm(acquisition ~
industry +
acq_exp +
revenue +
employees, data = train[-out,], family = binomial)
summary(model)
##
## Call:
## glm(formula = acquisition ~ industry + acq_exp + revenue + employees,
## family = binomial, data = train[-out, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2189 -0.3795 0.1791 0.5384 2.3296
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.4494662 1.2223138 -7.731 1.07e-14 ***
## industry 1.7265686 0.3341271 5.167 2.37e-07 ***
## acq_exp -0.0004982 0.0009393 -0.530 0.596
## revenue 0.0957225 0.0178870 5.352 8.72e-08 ***
## employees 0.0097873 0.0011185 8.750 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 474.77 on 389 degrees of freedom
## Residual deviance: 264.02 on 385 degrees of freedom
## AIC: 274.02
##
## Number of Fisher Scoring iterations: 6
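The residual deviance drops from 474.77 to 264.02, a sizeable improvement over the null model; McFadden's pseudo-R-squared summarizes this in one number (a quick sketch, not part of the original output):
# McFadden's pseudo-R^2 = 1 - residual deviance / null deviance
# with the deviances above: 1 - 264.02/474.77 ~ 0.44
1 - model$deviance / model$null.deviance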
# find the cutoff that balances sensitivity and specificity
pred <- prediction(predict(model, test, type = "response"),
test$acquisition)
#Predicted Probability and True Classification
auc <- round(as.numeric(performance(pred, measure = "auc")@y.values),3)
false.rates <-performance(pred, "fpr","fnr")
accuracy <-performance(pred, "acc","err")
plot(unlist(performance(pred, "sens")@x.values), unlist(performance(pred, "sens")@y.values),
type="l", lwd=2,
ylab="Sensitivity", xlab="Cutoff", main = paste("Maximized Cutoff\n","AUC: ",auc))
par(new=TRUE)
plot(unlist(performance(pred, "spec")@x.values), unlist(performance(pred, "spec")@y.values),
type="l", lwd=2, col='red', ylab="", xlab="")
axis(4, at=seq(0,1,0.2))
mtext("Specificity",side=4, padj=-2, col='red')
min.diff <-which.min(abs(unlist(performance(pred, "sens")@y.values) - unlist(performance(pred, "spec")@y.values)))
min.x<-unlist(performance(pred, "sens")@x.values)[min.diff]
min.y<-unlist(performance(pred, "spec")@y.values)[min.diff]
optimal <-min.x
abline(h = min.y, lty = 3)
abline(v = min.x, lty = 3)
text(min.x,0,paste("optimal threshold=",round(optimal,5)), pos = 4)
# ROC plot
perf <- performance(pred, "tpr","fpr")
plot(perf,colorize = T, main = "ROC Curve")
text(0.5,0.5, paste("AUC:", auc))
# confusion matrix at the chosen cutoff
pred_prob = predict(model, test, type = "response") # avoid shadowing predict()
#optcutoff = optimalCutoff(test$acquisition, pred_prob)
predict_choice = ifelse(pred_prob > optimal, 1, 0)
caret::confusionMatrix(as.factor(predict_choice),as.factor(test$acquisition), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 29 17
## 1 10 44
##
## Accuracy : 0.73
## 95% CI : (0.632, 0.8139)
## No Information Rate : 0.61
## P-Value [Acc > NIR] : 0.008104
##
## Kappa : 0.4503
##
## Mcnemar's Test P-Value : 0.248213
##
## Sensitivity : 0.7213
## Specificity : 0.7436
## Pos Pred Value : 0.8148
## Neg Pred Value : 0.6304
## Prevalence : 0.6100
## Detection Rate : 0.4400
## Detection Prevalence : 0.5400
## Balanced Accuracy : 0.7325
##
## 'Positive' Class : 1
##
# diagnostics on the refitted model
cooks_distance = cooks.distance(model)
plot(cooks_distance, col = "pink", pch = 19, cex = 1)
text(cooks_distance, labels = rownames(train[-out,])) # model excludes the flagged rows
exp(coef(model))
## (Intercept) industry acq_exp revenue employees
## 7.873158e-05 5.621332e+00 9.995019e-01 1.100454e+00 1.009835e+00
LR = Anova(model, test = "LR")
Wald = Anova(model, test = "Wald")
score = anova(model, test = "Rao") # sequential tests; row 1 is the NULL model
data.frame(LR = LR[,1], Score = score$Rao[-1], Wald = Wald$Chisq)
## LR Score Wald
## 1 30.7481715 25.72755 26.7020287
## 2 0.2819222 25.72755 0.2812974
## 3 34.1474828 25.72755 28.6388012
## 4 158.5480790 25.72755 76.5652389
data.frame(p.LR = LR$`Pr(>Chisq)`, p.Score = score$`Pr(>Chi)`[-1], p.Wald = Wald$`Pr(>Chisq)`)
## p.LR p.Score p.Wald
## 1 2.937805e-08 3.931739e-07 2.373722e-07
## 2 5.954441e-01 3.931739e-07 5.958521e-01
## 3 5.108929e-09 3.931739e-07 8.721692e-08
## 4 2.349007e-36 3.931739e-07 2.130570e-18
Using the crossover cutoff of roughly 0.719, logistic regression reaches 73% accuracy on the held-out test set.
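As a cross-check, ROCR can report the cutoff that maximizes raw accuracy, which need not coincide with the sensitivity-specificity crossover used above (a quick sketch):
# cutoff that maximizes raw test-set accuracy
perf_acc <- performance(pred, "acc")
cuts <- unlist(perf_acc@x.values)
accs <- unlist(perf_acc@y.values)
cuts[which.max(accs)] # accuracy-maximizing cutoff
max(accs)             # accuracy at that cutoff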
# make sure the response is a factor so rpart grows a classification tree
model <- rpart(as.factor(acquisition) ~
industry +
acq_exp +
revenue +
employees,
data = train) # simple DT model
rattle::fancyRpartPlot(model, sub = "") # visualize the DT
# save predictions from DT
preds = predict(model,test,type="class")
caret::confusionMatrix(as.factor(preds),as.factor(test$acquisition), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 21 14
## 1 18 47
##
## Accuracy : 0.68
## 95% CI : (0.5792, 0.7698)
## No Information Rate : 0.61
## P-Value [Acc > NIR] : 0.09018
##
## Kappa : 0.3148
##
## Mcnemar's Test P-Value : 0.59588
##
## Sensitivity : 0.7705
## Specificity : 0.5385
## Pos Pred Value : 0.7231
## Neg Pred Value : 0.6000
## Prevalence : 0.6100
## Detection Rate : 0.4700
## Detection Prevalence : 0.6500
## Balanced Accuracy : 0.6545
##
## 'Positive' Class : 1
##
The decision tree reaches 68% accuracy on the test set, a step down from the logistic regression.
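Before moving to random forests, it is worth checking whether pruning changes this; a sketch using rpart's cross-validated complexity table (the minimum-xerror rule below is one common choice, not part of the original analysis):
# prune at the cp with the lowest cross-validated error
best_cp <- model$cptable[which.min(model$cptable[, "xerror"]), "CP"]
pruned <- rpart::prune(model, cp = best_cp)
preds_pruned <- predict(pruned, test, type = "class")
mean(as.character(preds_pruned) == as.character(test$acquisition)) # pruned-tree accuracy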
set.seed(100)
model <- rfsrc(acquisition ~
industry +
acq_exp +
revenue +
employees,
data = train,
importance = TRUE,
ntree = 1000)
model
## Sample size: 400
## Frequency of class labels: 123, 277
## Number of trees: 1000
## Forest terminal node size: 1
## Average no. of terminal nodes: 57.611
## No. of variables tried at each split: 2
## Total no. of variables: 4
## Resampling used to grow trees: swor
## Resample size used to grow trees: 253
## Analysis: RF-C
## Family: class
## Splitting rule: gini *random*
## Number of random split points: 10
## Normalized brier score: 50.31
## AUC: 88.62
## Error rate: 0.18, 0.38, 0.09
##
## Confusion matrix:
##
## predicted
## observed 0 1 class.error
## 0 76 47 0.3821
## 1 26 251 0.0939
##
## Overall error rate: 18.25%
#model$importance # values vary a lot
#data.frame(importance = model$importance) %>%
# tibble::rownames_to_column(var = "variable") %>%
# ggplot(aes(x = reorder(variable,importance), y = importance)) +
# geom_bar(stat = "identity", fill = "orange", color = "black")+
# coord_flip() +
# labs(x = "Variables", y = "Variable importance")
mindepth <- max.subtree(model,
sub.order = TRUE)
# first order depths
print(round(mindepth$order, 3)[,1])
## industry acq_exp revenue employees
## 1.586 1.440 1.335 0.663
# visualise minimal depth
data.frame(md = round(mindepth$order, 3)[,1]) %>%
tibble::rownames_to_column(var = "variable") %>%
ggplot(aes(x = reorder(variable,desc(md)), y = md)) +
geom_bar(stat = "identity", fill = "orange", color = "black", width = 0.2)+
coord_flip() +
labs(x = "Variables", y = "Minimal Depth")
# interactions
mindepth$sub.order
## industry acq_exp revenue employees
## industry 0.1400193 0.1714415 0.1865178 0.17208282
## acq_exp 0.4324688 0.1298402 0.1719492 0.15711450
## revenue 0.4505814 0.1595010 0.1188671 0.15172322
## employees 0.3424350 0.1460361 0.1525367 0.05853769
as.matrix(mindepth$sub.order) %>%
reshape2::melt() %>%
data.frame() %>%
ggplot(aes(x = Var1, y = Var2, fill = value)) +
scale_x_discrete(position = "top") +
geom_tile(color = "white") +
viridis::scale_fill_viridis("Relative min. depth") +
labs(x = "", y = "") +
theme_bw()
# cross-check with vimp
find.interaction(model,
method = "vimp",
importance = "permute")
## Pairing employees with acq_exp
## Pairing employees with industry
## Pairing employees with revenue
## Pairing acq_exp with industry
## Pairing acq_exp with revenue
## Pairing industry with revenue
##
## Method: vimp
## No. of variables: 4
## Variables sorted by VIMP?: TRUE
## No. of variables used for pairing: 4
## Total no. of paired interactions: 6
## Monte Carlo replications: 1
## Type of noising up used for VIMP: permute
##
## Var 1 Var 2 Paired Additive Difference
## employees:acq_exp 0.0850 0.0303 0.1147 0.1153 -0.0006
## employees:industry 0.0850 0.0274 0.1047 0.1124 -0.0077
## employees:revenue 0.0850 0.0230 0.1014 0.1081 -0.0067
## acq_exp:industry 0.0298 0.0286 0.0441 0.0584 -0.0143
## acq_exp:revenue 0.0298 0.0236 0.0569 0.0534 0.0035
## industry:revenue 0.0287 0.0218 0.0511 0.0504 0.0007
All of the paired-minus-additive differences are negligible, so we can exclude interaction terms.
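find.interaction() also returns the table it prints, so the check can be made programmatic; a sketch, assuming the returned matrix keeps the printed column names and using an arbitrary 0.01 tolerance:
# flag pairs whose paired VIMP deviates from additivity by more than 0.01
inter <- find.interaction(model, method = "vimp", importance = "permute")
inter[abs(inter[, "Difference"]) > 0.01, , drop = FALSE] # empty here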
# Establish a list of possible values for hyper-parameters
mtry.values <- seq(1,4,1)
nodesize.values <- seq(2,16,2)
ntree.values <- seq(1e3,10e3,1e3)
# Create a data frame containing all combinations
hyper_grid <- expand.grid(mtry = mtry.values, nodesize = nodesize.values, ntree = ntree.values)
# Create an empty vector to store OOB error values
oob_err <- c()
# Write a loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {
# Train a Random Forest model
model <- rfsrc(acquisition ~
industry +
acq_exp +
revenue +
employees,
data = train,
mtry = hyper_grid$mtry[i],
nodesize = hyper_grid$nodesize[i],
ntree = hyper_grid$ntree[i])
# Store the model's final OOB error (column 1 of err.rate = overall error)
oob_err[i] <- tail(na.omit(model$err.rate[, 1]), 1)
}
# Identify optimal set of hyperparameters based on OOB error
opt_i <- which.min(oob_err)
print(hyper_grid[opt_i,])
##   mtry nodesize ntree
## 5    1        4  1000
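Rather than trusting a single argmin, it can help to look at the whole OOB error surface over the grid; a quick ggplot sketch using the hyper_grid and oob_err objects from the loop above:
# OOB error across the grid: one line per nodesize, one panel per ntree
data.frame(hyper_grid, oob_err = oob_err) %>%
ggplot(aes(x = mtry, y = oob_err, color = factor(nodesize))) +
geom_line() +
facet_wrap(~ ntree) +
labs(y = "OOB error", color = "nodesize")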
# note: the tuning loop is not seeded, so the selected optimum varies across
# runs; the refit below uses mtry = 2, nodesize = 8, ntree = 10000
set.seed(100)
model <- rfsrc(as.factor(acquisition) ~
industry +
acq_exp +
revenue +
employees,
data = train,
mtry = 2,
nodesize = 8,
ntree = 10000)
model
## Sample size: 400
## Frequency of class labels: 123, 277
## Number of trees: 10000
## Forest terminal node size: 8
## Average no. of terminal nodes: 23.0283
## No. of variables tried at each split: 2
## Total no. of variables: 4
## Resampling used to grow trees: swor
## Resample size used to grow trees: 253
## Analysis: RF-C
## Family: class
## Splitting rule: gini *random*
## Number of random split points: 10
## Normalized brier score: 49.9
## AUC: 89.16
## Error rate: 0.19, 0.37, 0.11
##
## Confusion matrix:
##
## predicted
## observed 0 1 class.error
## 0 78 45 0.3659
## 1 30 247 0.1083
##
## Overall error rate: 18.75%
# save predictions from the tuned forest (predict.rfsrc stores class labels in $class)
preds = predict(model, test)
caret::confusionMatrix(preds$class, as.factor(test$acquisition), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 20 13
## 1 19 48
##
## Accuracy : 0.68
## 95% CI : (0.5792, 0.7698)
## No Information Rate : 0.61
## P-Value [Acc > NIR] : 0.09018
##
## Kappa : 0.3083
##
## Mcnemar's Test P-Value : 0.37676
##
## Sensitivity : 0.7869
## Specificity : 0.5128
## Pos Pred Value : 0.7164
## Neg Pred Value : 0.6061
## Prevalence : 0.6100
## Detection Rate : 0.4800
## Detection Prevalence : 0.6700
## Balanced Accuracy : 0.6499
##
## 'Positive' Class : 1
##
# keep customers the forest predicts as acquired and who actually were acquired;
# score the full dataset so predictions align row-for-row (the 100 test-set
# predictions would otherwise silently recycle against all 500 rows)
data = acquisitionRetention
preds_all = predict(model, data)
data2 = data.frame(data, prediction = preds_all$class)
data2 = data2[data2$prediction == 1 & data2$acquisition == 1, ]
index2 = sample(nrow(data2), 0.8*nrow(data2))
train2 = data2[index2,]
test2 = data2[-index2,]
model <- lm(duration ~ profit + acq_exp + ret_exp + freq + crossbuy + sow +
industry + revenue + employees, data = train2)
preds = predict(model, test2)
sq_err = (preds - test2$duration)^2 # avoid shadowing Metrics::mse()
summary(model)
##
## Call:
## lm(formula = duration ~ profit + acq_exp + ret_exp + freq + crossbuy +
## sow + industry + revenue + employees, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -199.880 -19.249 2.385 26.870 103.183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 418.851748 33.264256 12.592 < 2e-16 ***
## profit 0.166750 0.029596 5.634 6.96e-08 ***
## acq_exp -0.314110 0.065558 -4.791 3.53e-06 ***
## ret_exp 0.730575 0.105709 6.911 8.73e-11 ***
## freq -6.168016 0.849070 -7.264 1.21e-11 ***
## crossbuy -0.991080 1.684312 -0.588 0.557015
## sow -0.009601 0.167230 -0.057 0.954281
## industry -27.144076 7.139536 -3.802 0.000198 ***
## revenue -0.400482 0.369725 -1.083 0.280223
## employees -0.085291 0.017430 -4.893 2.25e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.48 on 174 degrees of freedom
## Multiple R-squared: 0.9616, Adjusted R-squared: 0.9596
## F-statistic: 484.5 on 9 and 174 DF, p-value: < 2.2e-16
print(mean(sq_err))
## [1] 2008.668
print(mse(test2$duration, preds)) # Metrics::mse(actual, predicted) agrees
## [1] 2008.668
set.seed(100)
model <- rfsrc(duration ~ profit + acq_exp + ret_exp + freq + crossbuy + sow +
industry + revenue + employees,
mtry = 2,
nodesize = 8,
ntree = 10000, data = train2)
preds = predict(model, test2)
sq_err = (preds$predicted - test2$duration)^2
print(mean(sq_err))
## [1] 5673.281
print(mse(test2$duration, preds$predicted))
## [1] 5673.281
Something is off here: the random forest's test MSE (5673) is nearly triple the linear regression's (2009), when we would expect the forest to be at least competitive.
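One plausible culprit: mtry = 2 and nodesize = 8 were tuned for the acquisition classification task, not for this regression. A quick sketch refits with randomForestSRC's regression defaults and re-checks the test MSE (results will vary because the split is unseeded):
# refit the duration forest with default hyperparameters
set.seed(100)
model_default <- rfsrc(duration ~ profit + acq_exp + ret_exp + freq +
crossbuy + sow + industry + revenue + employees, data = train2)
preds_default <- predict(model_default, test2)
mean((preds_default$predicted - test2$duration)^2) # compare with 5673 above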