library(caret)
library(tree)
library(SMCRM)
library(PerformanceAnalytics)
library(xgboost)
library(randomForestSRC)
library(rpart)
library(car)
library(dplyr)
# theme for nice plotting
theme_nice <- theme_classic()+
theme(
axis.line.y.left = element_line(colour = "black"),
axis.line.y.right = element_line(colour = "black"),
axis.line.x.bottom = element_line(colour = "black"),
axis.line.x.top = element_line(colour = "black"),
axis.text.y = element_text(colour = "black", size = 12),
axis.text.x = element_text(color = "black", size = 12),
axis.ticks = element_line(color = "black")) +
theme(
axis.ticks.length = unit(-0.25, "cm"),
axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")),
axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")))
# Load the data
data(acquisitionRetention)
#Loading dataset
data1 = data.frame(acquisitionRetention)
str(acquisitionRetention)
## 'data.frame': 500 obs. of 15 variables:
## $ customer : num 1 2 3 4 5 6 7 8 9 10 ...
## $ acquisition: num 1 1 1 0 1 1 1 1 0 0 ...
## $ duration : num 1635 1039 1288 0 1631 ...
## $ profit : num 6134 3524 4081 -638 5446 ...
## $ acq_exp : num 694 460 249 638 589 ...
## $ ret_exp : num 972 450 805 0 920 ...
## $ acq_exp_sq : num 480998 211628 62016 407644 346897 ...
## $ ret_exp_sq : num 943929 202077 648089 0 846106 ...
## $ freq : num 6 11 21 0 2 7 15 13 0 0 ...
## $ freq_sq : num 36 121 441 0 4 49 225 169 0 0 ...
## $ crossbuy : num 5 6 6 0 9 4 5 5 0 0 ...
## $ sow : num 95 22 90 0 80 48 51 23 0 0 ...
## $ industry : num 1 0 0 0 0 1 0 1 0 1 ...
## $ revenue : num 47.2 45.1 29.1 40.6 48.7 ...
## $ employees : num 898 686 1423 181 631 ...
# Get the summary
summary(data1)
## customer acquisition duration profit
## Min. : 1.0 Min. :0.000 Min. : 0.0 Min. :-1027.0
## 1st Qu.:125.8 1st Qu.:0.000 1st Qu.: 0.0 1st Qu.: -316.3
## Median :250.5 Median :1.000 Median : 957.5 Median : 3369.9
## Mean :250.5 Mean :0.676 Mean : 742.5 Mean : 2403.8
## 3rd Qu.:375.2 3rd Qu.:1.000 3rd Qu.:1146.2 3rd Qu.: 3931.6
## Max. :500.0 Max. :1.000 Max. :1673.0 Max. : 6134.3
## acq_exp ret_exp acq_exp_sq ret_exp_sq
## Min. : 1.21 Min. : 0.0 Min. : 1.5 Min. : 0
## 1st Qu.: 384.14 1st Qu.: 0.0 1st Qu.: 147562.0 1st Qu.: 0
## Median : 491.66 Median : 398.1 Median : 241729.7 Median : 158480
## Mean : 493.35 Mean : 336.3 Mean : 271211.1 Mean : 184000
## 3rd Qu.: 600.21 3rd Qu.: 514.3 3rd Qu.: 360246.0 3rd Qu.: 264466
## Max. :1027.04 Max. :1095.0 Max. :1054811.2 Max. :1198937
## freq freq_sq crossbuy sow
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 6.00 Median : 36.00 Median : 5.000 Median : 44.00
## Mean : 6.22 Mean : 69.25 Mean : 4.052 Mean : 38.88
## 3rd Qu.:11.00 3rd Qu.:121.00 3rd Qu.: 7.000 3rd Qu.: 66.00
## Max. :21.00 Max. :441.00 Max. :11.000 Max. :116.00
## industry revenue employees
## Min. :0.000 Min. :14.49 Min. : 18.0
## 1st Qu.:0.000 1st Qu.:33.53 1st Qu.: 503.0
## Median :1.000 Median :41.43 Median : 657.5
## Mean :0.522 Mean :40.54 Mean : 671.5
## 3rd Qu.:1.000 3rd Qu.:47.52 3rd Qu.: 826.0
## Max. :1.000 Max. :65.10 Max. :1461.0
# Create a copy
data2 <- data1
str(data1)
## 'data.frame': 500 obs. of 15 variables:
## $ customer : num 1 2 3 4 5 6 7 8 9 10 ...
## $ acquisition: num 1 1 1 0 1 1 1 1 0 0 ...
## $ duration : num 1635 1039 1288 0 1631 ...
## $ profit : num 6134 3524 4081 -638 5446 ...
## $ acq_exp : num 694 460 249 638 589 ...
## $ ret_exp : num 972 450 805 0 920 ...
## $ acq_exp_sq : num 480998 211628 62016 407644 346897 ...
## $ ret_exp_sq : num 943929 202077 648089 0 846106 ...
## $ freq : num 6 11 21 0 2 7 15 13 0 0 ...
## $ freq_sq : num 36 121 441 0 4 49 225 169 0 0 ...
## $ crossbuy : num 5 6 6 0 9 4 5 5 0 0 ...
## $ sow : num 95 22 90 0 80 48 51 23 0 0 ...
## $ industry : num 1 0 0 0 0 1 0 1 0 1 ...
## $ revenue : num 47.2 45.1 29.1 40.6 48.7 ...
## $ employees : num 898 686 1423 181 631 ...
# Run Correlation chart including the yvar
chart.Correlation(data1, histogram=TRUE, pch=19)
Most of the variables are heavily correlated with response variable.
duration, profit, ret_exp, ret_exp_sq, freq, freq_sq, crossbuy and sow are heavily correlated.
Lets run some box plots to see if this makes sense.
# Create box plots to understand variable correlation with the response variable Acquistion
par(mfrow = c(2, 3))
boxplot(duration~ acquisition, data = data1, xlab = "acquisition", ylab = "duration")
boxplot(ret_exp~ acquisition, data = data1, xlab = "acquisition", ylab = "ret_exp")
boxplot(freq~ acquisition, data = data1, xlab = "acquisition", ylab = "freq")
boxplot(crossbuy~ acquisition, data = data1, xlab = "acquisition", ylab = "crossbuy")
boxplot(sow~ acquisition, data = data1, xlab = "acquisition", ylab = "sow")
boxplot(profit~ acquisition, data = data1, xlab = "acquisition", ylab = "profit")
It is clear that whenever the fields Duration, ret_exp, freq, crossbuy, sow are “0”, acquisition is always biased as 0.
Similarly, whenever profit is negative, acquisition is always 0 indicating that marketing costs are included for acqusition.
We will exclude these variables in model.
we will also exclude the “customer” variable in the model as it is just a number and doesnt have any influence on the model.
# Convert the acquisition as a factor
data1$acquisition<- as.factor(data1$acquisition)
# See if any missing values and remove them
data1 <- na.omit(data1)
# Run a random forest model to predict acqusition using only the kept fields based on the above correlation analysis
# set the seed
set.seed(123)
# Set a tune grid to run at different mtry's
tunegrid <- expand.grid(.mtry=c(1:15))
# Set ntree to 1000 and importance to True to see the variable importance
# Run a randomforest with only the fields industry, revenue, employees and acq_exp
rf.forest1 <- rfsrc(acquisition~acq_exp+industry+revenue+employees,
data = data1,
tuneGrid=tunegrid,
importance = TRUE,
ntree = 1000)
rf.forest1
## Sample size: 500
## Frequency of class labels: 162, 338
## Number of trees: 1000
## Forest terminal node size: 1
## Average no. of terminal nodes: 76.809
## No. of variables tried at each split: 2
## Total no. of variables: 4
## Resampling used to grow trees: swor
## Resample size used to grow trees: 316
## Analysis: RF-C
## Family: class
## Splitting rule: gini *random*
## Number of random split points: 10
## Normalized brier score: 57.26
## AUC: 85.97
## Error rate: 0.21, 0.41, 0.12
##
## Confusion matrix:
##
## predicted
## observed 0 1 class.error
## 0 96 66 0.4074
## 1 40 298 0.1183
##
## Overall error rate: 21.2%
This gave an accuracy rate of 78.8%.
# create a dataframe from the prediction values
predictedAcqusition <- rf.forest1$class.oob
data2<-cbind(data1,predictedAcqusition)
# Keep only the predicted acquisition customers only in a new dataframe
data3 <- data2 %>% filter(predictedAcqusition == 1)
summary(data3)
## customer acquisition duration profit acq_exp
## Min. : 1.0 0: 66 Min. : 0.0 Min. :-1027 Min. : 43.15
## 1st Qu.:130.8 1:298 1st Qu.: 813.5 1st Qu.: 3028 1st Qu.: 407.69
## Median :258.0 Median :1020.5 Median : 3555 Median : 496.07
## Mean :256.7 Mean : 894.2 Mean : 3028 Mean : 492.25
## 3rd Qu.:386.2 3rd Qu.:1175.2 3rd Qu.: 4049 3rd Qu.: 579.63
## Max. :500.0 Max. :1673.0 Max. : 6134 Max. :1027.04
## ret_exp acq_exp_sq ret_exp_sq freq
## Min. : 0.0 Min. : 1862 Min. : 0 Min. : 0.000
## 1st Qu.: 322.1 1st Qu.: 166208 1st Qu.: 103779 1st Qu.: 3.000
## Median : 439.1 Median : 246087 Median : 192835 Median : 8.000
## Mean : 404.9 Mean : 262301 Mean : 219752 Mean : 7.527
## 3rd Qu.: 538.4 3rd Qu.: 335971 3rd Qu.: 289858 3rd Qu.:11.000
## Max. :1082.4 Max. :1054811 Max. :1171525 Max. :21.000
## freq_sq crossbuy sow industry
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. :0.0000
## 1st Qu.: 9.00 1st Qu.: 3.00 1st Qu.: 29.75 1st Qu.:0.0000
## Median : 64.00 Median : 5.00 Median : 52.00 Median :1.0000
## Mean : 84.27 Mean : 4.92 Mean : 46.78 Mean :0.5907
## 3rd Qu.:121.00 3rd Qu.: 7.00 3rd Qu.: 68.00 3rd Qu.:1.0000
## Max. :441.00 Max. :11.00 Max. :116.00 Max. :1.0000
## revenue employees predictedAcqusition
## Min. :15.86 Min. : 56.0 0: 0
## 1st Qu.:35.98 1st Qu.: 606.8 1:364
## Median :43.06 Median : 744.5
## Mean :42.21 Mean : 753.7
## 3rd Qu.:48.16 3rd Qu.: 895.2
## Max. :65.10 Max. :1461.0
# Run Correlation chart including all variables
chart.Correlation(data3[,c(3,5:6,9,11:15)], histogram=TRUE, pch=19)
# Lets run a GLM model to see the variable collinearity with all variables
set.seed(123)
glm.fit1 <- glm(duration~profit+acq_exp+freq+ret_exp+crossbuy+sow+industry+revenue+employees, data=data3)
preds.glm1 = predict(glm.fit1, newdata = data3)
glm1.mse = mean((preds.glm1 - data3$duration)^2)
glm1.mse
## [1] 2647.914
car::vif(glm.fit1)
## profit acq_exp freq ret_exp crossbuy sow industry revenue
## 18.315825 1.131901 1.737211 10.056359 2.808369 2.559769 1.079648 1.091787
## employees
## 1.265496
Here profit and ret_exp are highly collinear as indicated in the correlation chart. Lets remove ret_exp as we are dealing with acquisition and see how the model collinerity is.
# Lets run a GLM model to see the variable collinearity with all variables
set.seed(123)
glm.fit2 <- glm(duration~profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees, data=data3)
preds.glm2 = predict(glm.fit2, newdata = data3)
glm2.mse = mean((preds.glm2 - data3$duration)^2)
glm2.mse
## [1] 6219.244
car::vif(glm.fit2)
## profit acq_exp freq crossbuy sow industry revenue employees
## 4.000792 1.039238 1.736314 2.617264 2.331227 1.058091 1.087949 1.245028
Even though the MSE is high, there is no collinearity involved.
# Lets create a random forest model to predict duration only for the predicted acquisition customers
# Run a random forest REGRESSION model to predict duration with all fields except the transformed squared variables and ret_exp
# set the seed
set.seed(123)
# Set a tune grid to run at different mtry's
tunegrid <- expand.grid(.mtry=c(1:15))
# Set ntree to 1000 and importance to True to see the variable importance
# Run a randomforest with all the fields
rf.forest2 <- rfsrc(duration~profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees,
data = data3,
tuneGrid=tunegrid,
importance = TRUE,
ntree = 1000)
rf.forest2
## Sample size: 364
## Number of trees: 1000
## Forest terminal node size: 5
## Average no. of terminal nodes: 39.23
## No. of variables tried at each split: 3
## Total no. of variables: 8
## Resampling used to grow trees: swor
## Resample size used to grow trees: 230
## Analysis: RF-R
## Family: regr
## Splitting rule: mse *random*
## Number of random split points: 10
## % variance explained: 97.1
## Error rate: 6232.35
# Get the predicted duration and write into a dataframe
predictedDuration <- predict(rf.forest2,data3)$predicted
data4 <- cbind(data3,predictedDuration)
# Calculate the actual duration and the predicted duration (how long) for the customers predicted to be acquired
TotalActualDuration <- sum(data4$duration)
TotalPredictedDuration <- sum(data4$predictedDuration)
cat("TotalActualDuration = ", TotalActualDuration)
## TotalActualDuration = 325490
cat("\nTotalPredictedDuration = ", TotalPredictedDuration)
##
## TotalPredictedDuration = 325819.2
# write the predicted acquisition (or) acquired customers and their predicted duration to a CSV file
write.csv(data4,file="casestudy4.csv",sep=",")
## Warning in write.csv(data4, file = "casestudy4.csv", sep = ","): attempt to set
## 'sep' ignored
rf.forest2$importance
## profit acq_exp freq crossbuy sow industry
## 62919.52006 3598.16319 37340.18739 54590.92936 22353.78101 125.35100
## revenue employees
## -27.07561 41.13362
# Creating a plot of variable importance
rf.forest2$importance # values vary a lot
## profit acq_exp freq crossbuy sow industry
## 62919.52006 3598.16319 37340.18739 54590.92936 22353.78101 125.35100
## revenue employees
## -27.07561 41.13362
# Use this if the question is related to variable importance on a classification model
# importance = rf.forest1$importance[,c(1)] # values vary a lot
data.frame(importance = rf.forest2$importance) %>%
tibble::rownames_to_column(var = "variable") %>%
ggplot(aes(x = reorder(variable,importance), y = importance)) +
geom_bar(stat = "identity", fill = "orange", color = "black")+
coord_flip() +
labs(x = "Variables", y = "Variable importance")+
theme_nice # where's industy? difficult to see
rf.forest2$importance %>% log() # log transform
## Warning in log(.): NaNs produced
## profit acq_exp freq crossbuy sow industry revenue employees
## 11.049612 8.188179 10.527825 10.907623 10.014751 4.831118 NaN 3.716826
data.frame(importance = rf.forest2$importance) %>% # add a large +ve constant
log() %>%
tibble::rownames_to_column(var = "variable") %>%
ggplot(aes(x = reorder(variable,importance), y = importance)) +
geom_bar(stat = "identity", fill = "orange", color = "black", width = 0.5)+
coord_flip() +
labs(x = "Variables", y = "Log-transformed variable importance") +
theme_nice
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 1 rows containing missing values (position_stack).
As all the values are less than 1 and positive, we dont need any transformation such as log or adding a large constant etc.
mindepth <- max.subtree(rf.forest2,
sub.order = TRUE)
# first order depths
print(round(mindepth$order, 3)[,1])
## profit acq_exp freq crossbuy sow industry revenue employees
## 1.635 3.297 2.279 2.274 3.097 7.579 4.109 3.778
# vizualise MD
data.frame(md = round(mindepth$order, 3)[,1]) %>%
tibble::rownames_to_column(var = "variable") %>%
ggplot(aes(x = reorder(variable,desc(md)), y = md)) +
geom_bar(stat = "identity", fill = "orange", color = "black", width = 0.2)+
coord_flip() +
labs(x = "Variables", y = "Minimal Depth")+
theme_nice
# interactions
mindepth$sub.order
## profit acq_exp freq crossbuy sow industry revenue
## profit 0.1399196 0.2264888 0.3205463 0.4089828 0.3510775 0.6802229 0.3576138
## acq_exp 0.2516335 0.2862922 0.4637345 0.5535774 0.4576455 0.7624997 0.4606001
## freq 0.3002827 0.3989477 0.2003254 0.5225195 0.4786879 0.7711915 0.4733314
## crossbuy 0.3387873 0.4275252 0.4883598 0.1999977 0.5227186 0.7921141 0.5137861
## sow 0.3905999 0.4891231 0.5645602 0.6259963 0.2717470 0.8379984 0.5698369
## industry 0.7872336 0.8231147 0.8627345 0.8951583 0.8713991 0.6518157 0.8634710
## revenue 0.4425062 0.5181028 0.6428629 0.6992784 0.6370630 0.8839927 0.3598838
## employees 0.3929604 0.4813238 0.6074370 0.6585310 0.5811056 0.8555074 0.5887954
## employees
## profit 0.3300100
## acq_exp 0.4277176
## freq 0.4782207
## crossbuy 0.5056605
## sow 0.5782995
## industry 0.8591563
## revenue 0.6262199
## employees 0.3291201
as.matrix(mindepth$sub.order) %>%
reshape2::melt() %>%
data.frame() %>%
ggplot(aes(x = Var1, y = Var2, fill = value)) +
scale_x_discrete(position = "top") +
geom_tile(color = "white") +
viridis::scale_fill_viridis("Relative min. depth") +
labs(x = "", y = "") +
theme_bw()
# cross-check with vimp
find.interaction(rf.forest2,
method = "vimp",
importance = "permute")
## Pairing profit with crossbuy
## Pairing profit with freq
## Pairing profit with sow
## Pairing profit with acq_exp
## Pairing profit with industry
## Pairing profit with employees
## Pairing profit with revenue
## Pairing crossbuy with freq
## Pairing crossbuy with sow
## Pairing crossbuy with acq_exp
## Pairing crossbuy with industry
## Pairing crossbuy with employees
## Pairing crossbuy with revenue
## Pairing freq with sow
## Pairing freq with acq_exp
## Pairing freq with industry
## Pairing freq with employees
## Pairing freq with revenue
## Pairing sow with acq_exp
## Pairing sow with industry
## Pairing sow with employees
## Pairing sow with revenue
## Pairing acq_exp with industry
## Pairing acq_exp with employees
## Pairing acq_exp with revenue
## Pairing industry with employees
## Pairing industry with revenue
## Pairing employees with revenue
##
## Method: vimp
## No. of variables: 8
## Variables sorted by VIMP?: TRUE
## No. of variables used for pairing: 8
## Total no. of paired interactions: 28
## Monte Carlo replications: 1
## Type of noising up used for VIMP: permute
##
## Var 1 Var 2 Paired Additive Difference
## profit:crossbuy 63075.9562 55542.0686 144710.2118 118618.0248 26092.1870
## profit:freq 63075.9562 37727.4432 118575.7512 100803.3994 17772.3518
## profit:sow 63075.9562 21976.4935 97062.6351 85052.4497 12010.1854
## profit:acq_exp 63075.9562 3469.3948 61971.0645 66545.3510 -4574.2865
## profit:industry 63075.9562 52.9427 63277.5128 63128.8989 148.6139
## profit:employees 63075.9562 130.7931 62952.0255 63206.7493 -254.7238
## profit:revenue 63075.9562 -12.2320 63162.9360 63063.7242 99.2117
## crossbuy:freq 55482.8485 37614.0256 113120.6145 93096.8742 20023.7403
## crossbuy:sow 55482.8485 22088.6401 87676.6559 77571.4887 10105.1673
## crossbuy:acq_exp 55482.8485 3530.7137 58758.6047 59013.5622 -254.9576
## crossbuy:industry 55482.8485 55.3493 56288.5705 55538.1978 750.3727
## crossbuy:employees 55482.8485 135.8067 55891.8118 55618.6552 273.1566
## crossbuy:revenue 55482.8485 53.3064 55719.0531 55536.1549 182.8982
## freq:sow 37784.7137 21637.8438 63436.6884 59422.5575 4014.1309
## freq:acq_exp 37784.7137 3523.1716 40690.1692 41307.8852 -617.7160
## freq:industry 37784.7137 38.7850 37528.1332 37823.4986 -295.3654
## freq:employees 37784.7137 192.7261 37144.1947 37977.4398 -833.2451
## freq:revenue 37784.7137 -22.8234 37699.7775 37761.8902 -62.1128
## sow:acq_exp 21742.8296 3462.1572 24222.9264 25204.9868 -982.0604
## sow:industry 21742.8296 56.5913 21183.8711 21799.4209 -615.5498
## sow:employees 21742.8296 188.9560 22719.7225 21931.7856 787.9369
## sow:revenue 21742.8296 -45.1399 21742.7874 21697.6897 45.0978
## acq_exp:industry 3505.8289 23.3140 3555.2209 3529.1429 26.0780
## acq_exp:employees 3505.8289 223.4505 3527.1586 3729.2794 -202.1209
## acq_exp:revenue 3505.8289 -19.1429 3592.9939 3486.6860 106.3080
## industry:employees 65.8362 184.1673 270.9310 250.0035 20.9275
## industry:revenue 65.8362 -15.5537 74.2656 50.2826 23.9830
## employees:revenue -16.0790 68.7050 -6.5460 52.6260 -59.1720
set.seed(123)
# Establish a list of possible values for hyper-parameters
mtry.values <- seq(4,6,1)
nodesize.values <- seq(4,8,2)
ntree.values <- seq(4e3,6e3,1e3)
# Create a data frame containing all combinations
hyper_grid <- expand.grid(mtry = mtry.values, nodesize = nodesize.values, ntree = ntree.values)
# Create an empty vector to store OOB error values
oob_err <- c()
# Write a loop over the rows of hyper_grid to train the grid of models
for (i in 1:nrow(hyper_grid)) {
# Train a Random Forest model
model <- rfsrc(duration ~ profit+acq_exp+freq+crossbuy+sow+industry+revenue+employees,
data = data3,
mtry = hyper_grid$mtry[i],
nodesize = hyper_grid$nodesize[i],
ntree = hyper_grid$ntree[i])
# Store OOB error for the model
oob_err[i] <- model$err.rate[length(model$err.rate)]
}
# Identify optimal set of hyperparmeters based on OOB error
opt_i <- which.min(oob_err)
print(hyper_grid[opt_i,])
## mtry nodesize ntree
## 12 6 4 5000
# Run a decision tree model
dt.model1 <- rpart(acquisition~acq_exp+industry+revenue+employees,
data = data1) # simple DT model
rattle::fancyRpartPlot(dt.model1, sub = "") # vizualize the DT
# Run the decision tree after applying Cross Validation
set.seed(123)
train.control <- trainControl(method = "CV", number=10)
dt.model2 <- train(acquisition~acq_exp+industry+revenue+employees,
data = data1,
trControl = train.control
,method = 'rpart')
dt.model2
## CART
##
## 500 samples
## 4 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 449, 450, 450, 451, 450, 450, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.05555556 0.7533605 0.41364355
## 0.06790123 0.7533605 0.40826208
## 0.25308642 0.6698639 0.04969707
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.06790123.
Decision Tree gave 75.33% accuracy with CV.
t_pred = predict(dt.model1,data1,type="class")
confMat <- table(data1$acquisition,t_pred)
confMat
## t_pred
## 0 1
## 0 112 50
## 1 27 311
accuracy <- sum(diag(confMat))/sum(confMat)
accuracy
## [1] 0.846
Decision Tree gave 84.6% accuracy without CV.
# Run a logistic regression model
set.seed(123)
train.control <- trainControl(method = "cv")
glm.model1 <- train(acquisition~acq_exp_sq+industry+revenue+employees,
data=data1,method= 'glm', family= 'binomial',trControl = train.control)
summary(glm.model1)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2982 -0.6250 0.2985 0.6434 2.3767
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.718e+00 7.927e-01 -8.475 < 2e-16 ***
## acq_exp_sq -1.331e-06 6.858e-07 -1.942 0.0522 .
## industry 1.599e+00 2.605e-01 6.138 8.35e-10 ***
## revenue 7.215e-02 1.337e-02 5.397 6.76e-08 ***
## employees 6.782e-03 7.222e-04 9.391 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 629.85 on 499 degrees of freedom
## Residual deviance: 418.36 on 495 degrees of freedom
## AIC: 428.36
##
## Number of Fisher Scoring iterations: 5
pred.glm.model1<-predict(glm.model1,data1)
confusionMatrix(pred.glm.model1,data1$acquisition)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 102 37
## 1 60 301
##
## Accuracy : 0.806
## 95% CI : (0.7686, 0.8398)
## No Information Rate : 0.676
## P-Value [Acc > NIR] : 5.918e-11
##
## Kappa : 0.5401
##
## Mcnemar's Test P-Value : 0.0255
##
## Sensitivity : 0.6296
## Specificity : 0.8905
## Pos Pred Value : 0.7338
## Neg Pred Value : 0.8338
## Prevalence : 0.3240
## Detection Rate : 0.2040
## Detection Prevalence : 0.2780
## Balanced Accuracy : 0.7601
##
## 'Positive' Class : 0
##
Logistic regression gave 80.6% accuracy.
# inspect relationship of avg_ret_exp with predicted duration with PDP
min(rf.forest2$xvar$acq_exp)
## [1] 43.15
max(rf.forest2$xvar$acq_exp)
## [1] 1027.04
acq_exp_seq = seq(40,1040,50)
# extract marginal effect using partial dependence
marginal.effect.rf2 <- partial(rf.forest2,
partial.xvar = "acq_exp",
partial.values = acq_exp_seq)
means.exp.rf2 <- marginal.effect.rf2$regrOutput$duration %>% colMeans()
marginal.effect.df.rf2 <-
data.frame(pred.duration.rf2 = means.exp.rf2, acq_exp_seq = acq_exp_seq)
ggplot(marginal.effect.df.rf2, aes(x = acq_exp_seq, y = pred.duration.rf2)) +
geom_point(shape = 21, color = "purple", size = 2, stroke = 1.2)+
geom_smooth(method = "lm", formula = y ~ poly(x,3), se = FALSE, color = "black")+ # try with other values
labs(x = "Acquisition expense in $", y = "Predicted duration") +
scale_x_continuous(breaks = seq(40,1040,50))+
theme_nice
# first check relationship between actual duration and acq_exp
ggplot(data3, aes(x = acq_exp, y = duration)) +
geom_point(shape = 21, col = "purple", size = 3) +
stat_smooth(method = "lm", se = FALSE, color = "black") +
scale_x_continuous(breaks = seq(40,1040,50)) +
scale_y_continuous(breaks = seq(0,1680,120)) +
geom_rug(sides = "b", col = "red", alpha = 0.2) +
labs(y = "Actual duration", x = "Acquisition expense in $") +
theme_nice
## `geom_smooth()` using formula 'y ~ x'
# repeat with smaller values of acq_exp
acq_exp_seq2 = seq(440,590,5)
# extract marginal effect using partial dependence
marginal.effect.rf2.new <- partial(rf.forest2,
partial.xvar = "acq_exp",
partial.values = acq_exp_seq2)
means.exp.rf2.new <- marginal.effect.rf2.new$regrOutput$duration %>% colMeans()
marginal.effect.df.rf2.new <-
data.frame(pred.duration.rf2.new = means.exp.rf2.new, acq_exp_seq2 = acq_exp_seq2)
ggplot(marginal.effect.df.rf2.new, aes(x = acq_exp_seq2, y = pred.duration.rf2.new)) +
geom_point(shape = 21, color = "purple", size = 2, stroke = 1.2)+
geom_smooth(method = "lm", formula = y ~ poly(x,3), se = FALSE, color = "black")+ # try with other values
labs(x = "Acquisition expense in $", y = "Predicted duration") +
scale_x_continuous(breaks = seq(440,590,5))+
theme_nice
lm_regr <- lm(duration ~ profit+acq_exp+acq_exp_sq+freq+freq_sq+crossbuy+sow+industry+revenue+employees,
data = data3)
summary(lm_regr)
##
## Call:
## lm(formula = duration ~ profit + acq_exp + acq_exp_sq + freq +
## freq_sq + crossbuy + sow + industry + revenue + employees,
## data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -295.179 -46.914 5.709 49.073 213.777
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.590e+02 4.789e+01 11.673 < 2e-16 ***
## profit 2.981e-01 5.387e-03 55.332 < 2e-16 ***
## acq_exp -9.973e-01 1.451e-01 -6.875 2.82e-11 ***
## acq_exp_sq 6.798e-04 1.418e-04 4.795 2.40e-06 ***
## freq -5.528e+00 3.500e+00 -1.579 0.11515
## freq_sq 1.252e-03 1.780e-01 0.007 0.99439
## crossbuy -7.244e+00 2.246e+00 -3.226 0.00137 **
## sow -5.946e-01 2.168e-01 -2.742 0.00641 **
## industry -4.021e+01 8.704e+00 -4.620 5.39e-06 ***
## revenue -9.220e-01 4.529e-01 -2.036 0.04252 *
## employees -1.158e-01 2.168e-02 -5.340 1.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 77.59 on 353 degrees of freedom
## Multiple R-squared: 0.9727, Adjusted R-squared: 0.972
## F-statistic: 1260 on 10 and 353 DF, p-value: < 2.2e-16