The objective is to assemble a machine algorithm that may predict if a customer churns (Y/N). Data is imported from the telecom dataset.
#Explovative Analysis
Exploration of dementions and names.
dim_desc(Telecom)
## [1] "[1,200 x 12]"
names(Telecom)
## [1] "X1" "Customer.ID" "Years.customer"
## [4] "Months.customer" "Minutes.in.2018" "Minutes.onnet"
## [7] "Minutes.offnet" "Number.of.SMS" "KBs.used"
## [10] "Total.Unique.Calls" "Previous.provider" "Churn.Status"
Glimpsing at the feature details.
glimpse(head(Telecom, n=1))
## Rows: 1
## Columns: 12
## $ X1 <dbl> 1
## $ Customer.ID <chr> "ADF1259"
## $ Years.customer <dbl> 3805
## $ Months.customer <dbl> 126.83
## $ Minutes.in.2018 <dbl> 4091.616
## $ Minutes.onnet <dbl> 1436.325
## $ Minutes.offnet <dbl> 2655.291
## $ Number.of.SMS <dbl> 81
## $ KBs.used <dbl> 3624.375
## $ Total.Unique.Calls <dbl> 117
## $ Previous.provider <chr> "KPN"
## $ Churn.Status <dbl> 0
#Cleaning Data
Mutate variable if character or a factor.
df <- Telecom %>% mutate_if(is.character, as.factor)
Inspecting missing values (NA).
df %>% map(~ sum(is.na(.)))
## $X1
## [1] 0
##
## $Customer.ID
## [1] 0
##
## $Years.customer
## [1] 0
##
## $Months.customer
## [1] 0
##
## $Minutes.in.2018
## [1] 0
##
## $Minutes.onnet
## [1] 0
##
## $Minutes.offnet
## [1] 0
##
## $Number.of.SMS
## [1] 0
##
## $KBs.used
## [1] 0
##
## $Total.Unique.Calls
## [1] 0
##
## $Previous.provider
## [1] 6
##
## $Churn.Status
## [1] 0
df$Previous.provider has 6 missing values, therefore it will be removed. Unique Customer.ID is not an usefull feature henche this will also be removed.
df <- df %>% select(-Customer.ID, -Previous.provider)
head(df)
## # A tibble: 6 x 10
## X1 Years.customer Months.customer Minutes.in.2018 Minutes.onnet
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 3805 127. 4092. 1436.
## 2 2 2905 96.8 3179. 1950.
## 3 3 3819 127. 7233. 949.
## 4 4 3643 121. 7336. 1765.
## 5 5 3809 127. 5391. 3519.
## 6 6 2260 75.3 901. 160.
## # … with 5 more variables: Minutes.offnet <dbl>, Number.of.SMS <dbl>,
## # KBs.used <dbl>, Total.Unique.Calls <dbl>, Churn.Status <dbl>
Changing Churn.Status values from 0/1 to No/Yes.
df$Churn.Status <- as.factor(mapvalues(df$Churn.Status,
from=c("0","1"),
to=c("No", "Yes")))
head(df)
## # A tibble: 6 x 10
## X1 Years.customer Months.customer Minutes.in.2018 Minutes.onnet
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 3805 127. 4092. 1436.
## 2 2 2905 96.8 3179. 1950.
## 3 3 3819 127. 7233. 949.
## 4 4 3643 121. 7336. 1765.
## 5 5 3809 127. 5391. 3519.
## 6 6 2260 75.3 901. 160.
## # … with 5 more variables: Minutes.offnet <dbl>, Number.of.SMS <dbl>,
## # KBs.used <dbl>, Total.Unique.Calls <dbl>, Churn.Status <fct>
#Partitioning
Splitting data for cross validation. Creating 75% train / 25% test.
set.seed(1)
inTrain <- createDataPartition(y = df$Churn.Status, p=0.75, list=FALSE)
train <- df[inTrain,]
## Warning: The `i` argument of ``[`()` can't be a matrix as of tibble 3.0.0.
## Convert to a vector.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
test <- df[-inTrain,]
#GLM
Fitting GLM model.
fit <- glm(Churn.Status~., data=train, family=binomial)
fit
##
## Call: glm(formula = Churn.Status ~ ., family = binomial, data = train)
##
## Coefficients:
## (Intercept) X1 Years.customer Months.customer
## 4.196e-01 3.383e-06 3.889e-03 -1.212e-01
## Minutes.in.2018 Minutes.onnet Minutes.offnet Number.of.SMS
## -3.699e+05 3.699e+05 3.699e+05 5.396e-03
## KBs.used Total.Unique.Calls
## -5.535e-08 -1.268e-03
##
## Degrees of Freedom: 900 Total (i.e. Null); 891 Residual
## Null Deviance: 1249
## Residual Deviance: 1150 AIC: 1170
Making predictions. Convert character responses to factor types. This is so that the ecoding is correct for the logistic regression model.
set.seed(1)
churn.probs <- predict(fit, test, type="response")
head(churn.probs, n = 10)
## 1 2 3 4 5 6 7
## 0.52975201 0.43020542 0.39418607 0.27744830 0.59813793 0.23279180 0.62859986
## 8 9 10
## 0.04271687 0.56925716 0.60507800
glm.pred = rep("No", length(churn.probs))
glm.pred[churn.probs > 0.4] = "Yes" # set threshold (0/1)
glm.pred <- as.factor(glm.pred)
head(glm.pred, n=5)
## [1] Yes Yes No No Yes
## Levels: No Yes
Evaluating the model by confusion matrix.
confusionMatrix(glm.pred, test$Churn.Status, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 64 14
## Yes 88 133
##
## Accuracy : 0.6589
## 95% CI : (0.6021, 0.7125)
## No Information Rate : 0.5084
## P-Value [Acc > NIR] : 1.028e-07
##
## Kappa : 0.3231
##
## Mcnemar's Test P-Value : 4.899e-13
##
## Sensitivity : 0.9048
## Specificity : 0.4211
## Pos Pred Value : 0.6018
## Neg Pred Value : 0.8205
## Prevalence : 0.4916
## Detection Rate : 0.4448
## Detection Prevalence : 0.7391
## Balanced Accuracy : 0.6629
##
## 'Positive' Class : Yes
##
Plotting ROC curve.
#create prediction object
pr <- prediction(churn.probs, test$Churn.Status)
#plot curve
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
# AUC value
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.7188955
#Decision Three
Training model and plot results.
set.seed(35)
# Train model
tree.model <- rpart(Churn.Status ~ .,
data = train,
method = "class",
control = rpart.control(xval = 10))
# Plot
rpart.plot(tree.model)
Evaluation model metrics.
tree.pred <- predict(tree.model, newdata = test, type = "class")
tree.result <- confusionMatrix(data = tree.pred, test$Churn.Status)
tree.result
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 128 37
## Yes 24 110
##
## Accuracy : 0.796
## 95% CI : (0.7458, 0.8402)
## No Information Rate : 0.5084
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5913
##
## Mcnemar's Test P-Value : 0.1244
##
## Sensitivity : 0.8421
## Specificity : 0.7483
## Pos Pred Value : 0.7758
## Neg Pred Value : 0.8209
## Prevalence : 0.5084
## Detection Rate : 0.4281
## Detection Prevalence : 0.5518
## Balanced Accuracy : 0.7952
##
## 'Positive' Class : No
##
tree.precision <- tree.result$byClass['Pos Pred Value']
tree.precision
## Pos Pred Value
## 0.7757576
tree.recall <- tree.result$byClass['Sensitivity']
tree.recall
## Sensitivity
## 0.8421053
tree.F1 <- tree.result$byClass['F1']
tree.F1
## F1
## 0.807571
#Random Forest
Train model with 200 trees.
set.seed(7)
# Train model
forest.model <- randomForest(Churn.Status ~.,
data = train,
ntree=200, # set number of trees
type="classification")
# See error reduction with number of trees ( not much gained beyond ~25 trees)
plot(forest.model)
Plotting features random forest.
varImpPlot(forest.model, sort = T, main="Variable Importance")
Evaluation model metrics.
forest.pred <- predict(forest.model, newdata = test, type = "class")
forest.result <- confusionMatrix(data = forest.pred, test$Churn.Status)
forest.result
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 132 30
## Yes 20 117
##
## Accuracy : 0.8328
## 95% CI : (0.7856, 0.8733)
## No Information Rate : 0.5084
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6651
##
## Mcnemar's Test P-Value : 0.2031
##
## Sensitivity : 0.8684
## Specificity : 0.7959
## Pos Pred Value : 0.8148
## Neg Pred Value : 0.8540
## Prevalence : 0.5084
## Detection Rate : 0.4415
## Detection Prevalence : 0.5418
## Balanced Accuracy : 0.8322
##
## 'Positive' Class : No
##
forest.precision <- forest.result$byClass['Pos Pred Value']
forest.precision
## Pos Pred Value
## 0.8148148
forest.recall <- forest.result$byClass['Sensitivity']
forest.recall
## Sensitivity
## 0.8684211
forest.F1 <- forest.result$byClass['F1']
forest.F1
## F1
## 0.8407643