Sameer Mathur
Decision Tree using caret Package
---
# Read the cleaned HR dataset.
# stringsAsFactors = TRUE is set explicitly: since R 4.0 read.csv() no longer
# converts strings to factors by default, and the classification code in this
# script treats Status and the other categorical columns as factors.
renegeHR.df <- read.csv("CleanHRDataV2.csv", stringsAsFactors = TRUE)
# Attach data columns to the search path.
# NOTE(review): every visible reference uses renegeHR.df$... or data = ...,
# so attach() looks unnecessary here; kept only in case code outside this
# view relies on bare column names -- consider removing it.
attach(renegeHR.df)
# Dimensions of the dataset (rows, columns)
dim(renegeHR.df)
[1] 8995 16
# Compact overview of every column: its type and first few values
str(object = renegeHR.df)
'data.frame': 8995 obs. of 16 variables:
$ DOJExtend : Factor w/ 2 levels "No","Yes": 2 1 1 1 2 2 2 2 1 1 ...
$ DurToAcptOffer: int 14 18 3 26 1 17 37 16 1 6 ...
$ NoticePeriod : int 30 30 45 30 120 30 30 0 30 30 ...
$ Band : Factor w/ 4 levels "E0","E1","E2",..: 3 3 3 3 3 2 3 2 2 2 ...
$ CTCHikeExp : num -20.8 50 42.8 42.8 42.6 ...
$ CTCHikeOffered: num 13.2 320 42.8 42.8 42.6 ...
$ CTCDiff : num 42.9 180 0 0 0 ...
$ JoiningBonus : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ Relocate : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
$ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 2 2 1 1 2 ...
$ CandidateSrc : Factor w/ 3 levels "Agency","Direct",..: 1 3 1 3 3 3 3 2 3 3 ...
$ RexInYrs : int 7 8 4 4 6 2 7 8 3 3 ...
$ LOB : Factor w/ 9 levels "AXON","BFSI",..: 5 8 8 8 8 8 8 7 2 3 ...
$ Location : Factor w/ 11 levels "Ahmedabad","Bangalore",..: 9 3 9 9 9 9 9 9 5 3 ...
$ Age : int 34 34 27 34 34 34 32 34 26 34 ...
$ Status : Factor w/ 2 levels "Joined","NotJoined": 1 1 1 1 1 1 1 1 1 1 ...
# Summary statistics per column; keep only the first five columns of the
# describe() output (vars, n, mean, sd, median)
library(psych)
describe(renegeHR.df)[, 1:5]
vars n mean sd median
DOJExtend* 1 8995 1.47 0.50 1
DurToAcptOffer 2 8995 21.43 25.81 10
NoticePeriod 3 8995 39.29 22.22 30
Band* 4 8995 2.39 0.63 2
CTCHikeExp 5 8995 43.86 29.79 40
CTCHikeOffered 6 8995 40.66 36.06 36
CTCDiff 7 8995 -1.57 19.61 0
JoiningBonus* 8 8995 1.05 0.21 1
Relocate* 9 8995 1.14 0.35 1
Gender* 10 8995 1.83 0.38 2
CandidateSrc* 11 8995 1.89 0.67 2
RexInYrs 12 8995 4.24 2.55 4
LOB* 13 8995 5.18 2.38 5
Location* 14 8995 4.94 3.00 3
Age 15 8995 29.91 4.10 29
Status* 16 8995 1.19 0.39 1
library(caret)
# Split the data 80/20 on Status; the seed makes the split reproducible
set.seed(2341)
trainIndex <- createDataPartition(
  renegeHR.df$Status,
  p = 0.80,
  list = FALSE
)
# Rows selected by the partition form the training set (80%)
trainHRData.df <- renegeHR.df[trainIndex, ]
# All remaining rows form the testing set (20%)
testHRData.df <- renegeHR.df[-trainIndex, ]
# Class counts of Status in the training data
table(trainHRData.df$Status)
Joined NotJoined
5851 1346
# Class counts of Status in the testing data
table(testHRData.df[["Status"]])
Joined NotJoined
1462 336
# Fit an rpart decision tree for Status on all other columns, splitting on
# information gain and resampling with cross-validation
dTreeInfoGain <- train(
  Status ~ .,
  data = trainHRData.df,
  method = "rpart",
  parms = list(split = "information"),
  trControl = trainControl(method = "cv")
)
# Print the resampling summary of the fitted model
dTreeInfoGain
# Visualization of the final fitted tree
library(rpart.plot)
prp(
  dTreeInfoGain$finalModel,
  box.palette = "Reds",
  tweak = 1.2,
  varlen = 20
)
# Predicted class probabilities on the test data: a data frame with one
# column per Status level ("Joined", "NotJoined")
predProbTestInfoGain <- predict(dTreeInfoGain, testHRData.df, type = "prob")
# Plot the predicted probability of reneging ("NotJoined") for each test row.
# The column is selected by name rather than position, and the x axis is
# labelled as what plot() actually draws for a single vector: the row index
# of each test-set candidate (the original label said "Customer ID").
plot(predProbTestInfoGain[, "NotJoined"],
     main = "Scatterplot of Probabilities of Renege (Yes / No) (test data)",
     xlab = "Candidate Index (test data row)",
     ylab = "Predicted Probability of Renege (Yes / No)")
# Plot the scaled variable importance of the fitted tree.
# `main` belongs to plot(), not varImp(): passed to varImp() it never reaches
# the plotting code, so the title was not applied to the chart.
plot(varImp(dTreeInfoGain, scale = TRUE),
     main = "Important Variables")
# Predicted class labels on the test data.
# The outcome column is excluded by name instead of the hard-coded 1:15, so
# the selection keeps working if columns are added, removed or reordered.
predictorCols <- setdiff(names(testHRData.df), "Status")
predClassTestInfoGain <- predict(dTreeInfoGain,
                                 testHRData.df[, predictorCols, drop = FALSE],
                                 type = "raw")
# Confusion matrix of predicted vs. actual Status, treating "NotJoined"
# (the renege outcome) as the positive class
confusionMatrix(predClassTestInfoGain, testHRData.df$Status,
                positive = "NotJoined")
Confusion Matrix and Statistics
Reference
Prediction Joined NotJoined
Joined 1428 269
NotJoined 34 67
Accuracy : 0.8315
95% CI : (0.8134, 0.8485)
No Information Rate : 0.8131
P-Value [Acc > NIR] : 0.02355
Kappa : 0.2411
Mcnemar's Test P-Value : < 2e-16
Sensitivity : 0.19940
Specificity : 0.97674
Pos Pred Value : 0.66337
Neg Pred Value : 0.84148
Prevalence : 0.18687
Detection Rate : 0.03726
Detection Prevalence : 0.05617
Balanced Accuracy : 0.58807
'Positive' Class : NotJoined
library(ROCR)
# ROC curve for the decision tree on the test data.
# NOTE(review): the lg*/aucLogit names look like leftovers from a logistic
# regression script; they hold the decision-tree results here. Names are kept
# unchanged in case they are referenced elsewhere.
# Scores are passed as a plain numeric vector selected by class name, and
# label.ordering states explicitly that "NotJoined" is the positive class
# (negative label first, positive label second).
lgPredObj <- prediction(predProbTestInfoGain[, "NotJoined"],
                        testHRData.df$Status,
                        label.ordering = c("Joined", "NotJoined"))
lgPerfObj <- performance(lgPredObj, "tpr", "fpr")
plot(lgPerfObj, main = "ROC Curve", col = 2, lwd = 2)
# Diagonal reference line
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# AUC for the decision tree
aucLogit <- performance(lgPredObj, measure = "auc")
aucLogit <- aucLogit@y.values[[1]]
aucLogit
[1] 0.7247207