The dataset comes from a CRM system holding demographic, financial, product, and communication information on customers, together with a flag recording whether or not the customer churned. To be included in this dataset a customer had to have signed up and paid for at least one month of service. A record covers a single tenure period; a new record is logged if a churned customer is won back.
sapply(telecom, class)
## customerID gender SeniorCitizen Partner
## "character" "character" "numeric" "character"
## Dependents tenure PhoneService MultipleLines
## "character" "numeric" "character" "character"
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## "character" "character" "character" "character"
## TechSupport StreamingTV StreamingMovies Contract
## "character" "character" "character" "character"
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## "character" "character" "numeric" "numeric"
## numAdminTickets numTechTickets Churn
## "numeric" "numeric" "character"
#very few explicit NA values
sapply(telecom, function(x) sum(is.na(x)))
## customerID gender SeniorCitizen Partner
## 0 0 0 0
## Dependents tenure PhoneService MultipleLines
## 0 0 0 0
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## 0 0 0 0
## TechSupport StreamingTV StreamingMovies Contract
## 0 0 0 0
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## 0 0 0 11
## numAdminTickets numTechTickets Churn
## 0 0 0
#number of levels per category: all are reasonable
telecom |>
select(!customerID) |>
select_if(is.character) |>
mutate_if(is.character,as.factor) |>
summarise_all(~length(levels(.))) |>
print(width=Inf)
## # A tibble: 1 × 16
## gender Partner Dependents PhoneService MultipleLines InternetService
## <int> <int> <int> <int> <int> <int>
## 1 2 2 2 2 3 3
## OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV
## <int> <int> <int> <int> <int>
## 1 3 3 3 3 3
## StreamingMovies Contract PaperlessBilling PaymentMethod Churn
## <int> <int> <int> <int> <int>
## 1 3 3 2 4 2
First mutation: convert the character columns to factors
telecom |>
select(!customerID) |>
mutate(across(where(is.character),as.factor)) -> telecom.f
#sanity check of the level-counting logic used above
length(levels(as.factor(c('one','two'))))
## [1] 2
numAdminTickets: heavily skewed; very few accounts raise admin tickets. Check the distribution of the response for numAdminTickets = 0 versus numAdminTickets > 0.
numTechTickets: conduct the same analysis as for numAdminTickets, since this variable is also heavily skewed.
telecom.f |>
mutate(numAdminTickets.s = ifelse(numAdminTickets>0, 'greater', 'zero')) |>
mutate(numTechTickets.f = ifelse(numTechTickets>0, 'greater','zero')) -> telecom.f
#there is little difference in the Churn split between numAdminTickets = 0 and numAdminTickets > 0; this field is a candidate for removal
telecom.f |>
ggplot(aes(numAdminTickets.s, fill = Churn)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent)
#there is a significant difference in Churn proportion depending on this variable... but are the bins evenly populated?
telecom.f |>
ggplot(aes(numTechTickets.f, fill = Churn)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent)
Data mutation: binning the variable
telecom.f |>
mutate(numTechTickets.bin = cut(numTechTickets, breaks = c(0,0.5,3,6,9), include.lowest=TRUE)) -> telecom.f
#relabel the bins: [0,0.5] holds the zero counts, (0.5,3] holds 1-3 tickets
levels(telecom.f$numTechTickets.bin) <- c('0','<=3','3<x<=6','6<x<=9')
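As a toy illustration (not part of the pipeline) of how these breaks map raw ticket counts to bins:
#0 falls in [0,0.5]; 1-3 in (0.5,3]; 4-6 in (3,6]; 7-9 in (6,9]
table(cut(c(0, 1, 3, 4, 6, 7, 9), breaks = c(0, 0.5, 3, 6, 9), include.lowest = TRUE))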
telecom.f |>
ggplot(aes(x=numTechTickets.bin, fill=Churn))+
geom_bar(position="fill") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Yes|No churn split on Number of Tech Tickets opened")
table(telecom.f$numTechTickets.bin)
##
## 0 <=3 3<x<=6 6<x<=9
## 6073 608 321 41
#whittle down the dataframe: drop the raw ticket counts and the interim split fields, and remove the 11 rows with missing TotalCharges
telecom.f |>
select(!c(numTechTickets,numTechTickets.f,numAdminTickets.s,numAdminTickets)) |>
filter(!is.na(TotalCharges))-> telecom.f
Convert SeniorCitizen to categorical
telecom.f |>
mutate(seniorCitizen.f = as.factor(SeniorCitizen)) |>
select(-SeniorCitizen) -> telecom.f
sort(names(telecom.f))
## [1] "Churn" "Contract" "Dependents"
## [4] "DeviceProtection" "gender" "InternetService"
## [7] "MonthlyCharges" "MultipleLines" "numTechTickets.bin"
## [10] "OnlineBackup" "OnlineSecurity" "PaperlessBilling"
## [13] "Partner" "PaymentMethod" "PhoneService"
## [16] "seniorCitizen.f" "StreamingMovies" "StreamingTV"
## [19] "TechSupport" "tenure" "TotalCharges"
Compute the proportion of records falling outside the usual outlier fences: below Q1 - 1.5*IQR or above Q3 + 1.5*IQR (the IQR is a single number per variable)
prop_outside <- function(vec){
iqr <- stats::IQR(vec, na.rm=TRUE)
prop_under <- sum(vec < (quantile(vec, .25, na.rm=TRUE) - 1.5*iqr), na.rm=TRUE)/length(vec)
prop_over <- sum(vec > (quantile(vec, .75, na.rm=TRUE) + 1.5*iqr), na.rm=TRUE)/length(vec)
return(prop_under + prop_over)
}
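A toy check of the helper, with one injected extreme value (illustrative only, output not shown):
#the single value above Q3 + 1.5*IQR is flagged: 1/101, about 0.0099
prop_outside(c(1:100, 1000))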
#apply across all numeric variables and print the output
#this shows that there are no outliers to worry about in any numeric variable
telecom.f %>%
select_if(is.numeric) %>%
pivot_longer(cols=everything(), names_to='variables', values_to='value') %>%
group_by(variables) %>%
summarize(proportion_outliers = prop_outside(value))
## # A tibble: 3 × 2
## variables proportion_outliers
## <chr> <dbl>
## 1 MonthlyCharges 0
## 2 TotalCharges 0
## 3 tenure 0
The numeric variables are highly correlated with one another, which makes sense: TotalCharges accumulates MonthlyCharges over a customer's tenure
telecom.f %>%
keep(is.numeric) %>%
cor() %>%
corrplot(type = "upper")
Plot tenure against TotalCharges to verify that TotalCharges is essentially cumulative charges over the tenure period. Note that tenure is discrete (whole months), not continuous.
telecom.f |>
ggplot(aes(x=tenure,y=TotalCharges)) +
geom_point() +
labs(title="Linear relationship between tenure and TotalCharges")
#keep tenure and drop TotalCharges to remove the redundancy
telecom.f |>
select(-TotalCharges) -> telecom.f
#check the balance of the dataset: the split is 73/27, so rebalancing with ROSE may help
Split of the response variable
table(telecom.f$Churn)/length(telecom.f$Churn)
##
## No Yes
## 0.734215 0.265785
#no preprocessing necessary for random forest
train_ix <- caret::createDataPartition(telecom.f$Churn, p=.8,list=FALSE,times=1)
telecom.f_train <- telecom.f[train_ix,]
telecom.f_test <- telecom.f[-train_ix,]
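createDataPartition stratifies on the response, so both splits should preserve the overall 73/27 split; a quick confirmation (output not shown):
prop.table(table(telecom.f_train$Churn))
prop.table(table(telecom.f_test$Churn))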
#creating a balanced dataset
#ROSE draws a synthetic, balanced sample (p = 0.5 targets ~50% Churn = Yes)
table(telecom.f_train$Churn) # aim for N = 2x the "No" cases
##
## No Yes
## 4131 1496
no_cases <- nrow(telecom.f_train[telecom.f_train$Churn=="No",])
rose_train <- ROSE(Churn ~ ., data = telecom.f_train, N = no_cases*2, p = 0.5)$data
#confirm the class balance
table(rose_train$Churn)
##
## No Yes
## 4087 4175
rf_dflt <- randomForest(
Churn ~ .,
data = telecom.f_train
#ntree = 100,
#mtry = 6,
#na.action = na.roughfix
)
rf_balanced <- randomForest(
Churn ~ .,
data = rose_train
#ntree = 100,
#mtry = 6,
#na.action = na.roughfix
)
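An optional sanity check before touching the test set (not in the original output): inspect each forest's out-of-bag error.
#print() reports the OOB error rate and confusion matrix of each fit
print(rf_dflt)
print(rf_balanced)
#plot() shows the OOB error stabilizing as trees are added
plot(rf_balanced, main = "OOB error vs. number of trees (balanced fit)")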
#predicted class probabilities: an n x 2 matrix (one column per class)
rf_dflt_prob <- predict(rf_dflt, telecom.f_test, type = "prob")
dim(rf_dflt_prob)
## [1] 1405 2
#predicted classes: a factor vector
rf_predict <- predict(rf_dflt, telecom.f_test, type="class")
rf_predict_bal <- predict(rf_balanced, telecom.f_test, type="class")
cm_rf_dflt <- confusionMatrix(rf_predict, telecom.f_test$Churn, positive = "Yes")
cm_rf_bal <- confusionMatrix(rf_predict_bal, telecom.f_test$Churn, positive = "Yes")
cm_rf_dflt$byClass
## Sensitivity Specificity Pos Pred Value
## 0.7184987 0.9273256 0.7813411
## Neg Pred Value Precision Recall
## 0.9011299 0.7813411 0.7184987
## F1 Prevalence Detection Rate
## 0.7486034 0.2654804 0.1907473
## Detection Prevalence Balanced Accuracy
## 0.2441281 0.8229121
cm_rf_bal$byClass
## Sensitivity Specificity Pos Pred Value
## 0.8954424 0.8246124 0.6485437
## Neg Pred Value Precision Recall
## 0.9561798 0.6485437 0.8954424
## F1 Prevalence Detection Rate
## 0.7522523 0.2654804 0.2377224
## Detection Prevalence Balanced Accuracy
## 0.3665480 0.8600274
Rebalancing trades precision for recall: sensitivity rises from 0.72 to 0.90 while positive predictive value falls from 0.78 to 0.65, and balanced accuracy improves from 0.82 to 0.86.
Variable importance
#type = 1 (mean decrease in accuracy) requires the forest to be trained with
#importance = TRUE; type = 2 (MeanDecreaseGini) is always available
vim.mat.1 <- randomForest::importance(rf_balanced,type=1)
vim.mat.2 <- randomForest::importance(rf_balanced,type=2)
vim.mat.2[order(vim.mat.2[, 1], decreasing=TRUE), ]
## tenure numTechTickets.bin Contract MonthlyCharges
## 673.86095 564.30363 462.91216 376.79826
## PaymentMethod OnlineSecurity TechSupport InternetService
## 206.81881 193.16346 175.19971 133.79516
## OnlineBackup DeviceProtection StreamingTV StreamingMovies
## 120.19357 96.94393 85.08181 80.47320
## gender MultipleLines PaperlessBilling Partner
## 78.57071 77.95508 68.47034 68.35103
## Dependents seniorCitizen.f PhoneService
## 56.64703 52.43403 17.77561
dim(vim.mat.2)
## [1] 19 1
as.data.frame(vim.mat.2) -> df
df$vars <- rownames(df)
df |>
arrange(desc(MeanDecreaseGini)) -> df
df |>
ggplot(aes(x=reorder(vars, MeanDecreaseGini),y=MeanDecreaseGini)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variable Importance")
Fit a decision tree in order to identify some key thresholds
#restrict to the response plus the top-10 most important variables, for an
#optional reduced fit (the tree below is fit on the full rose_train)
rose_train |>
select(Churn, tenure, numTechTickets.bin, Contract, MonthlyCharges, PaymentMethod, OnlineSecurity, TechSupport, InternetService, OnlineBackup, DeviceProtection) -> rose.train.k
tree_mod <-
rpart::rpart(
Churn ~ .,
method = "class",
xval = 5,
data = rose_train,
#original dataset has no NAs
#na.action = na.roughfix
)
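A possible follow-up, since xval = 5 cross-validation folds were requested: inspect the complexity table to confirm the tree is not overgrown (output not shown).
#printcp() lists each split's complexity parameter with cross-validated error (xerror)
rpart::printcp(tree_mod)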
#model evaluation
dt_predict_bal <- predict(tree_mod, telecom.f_test, type="class")
cm <- confusionMatrix(dt_predict_bal, telecom.f_test$Churn, positive = "Yes")
cm$byClass
## Sensitivity Specificity Pos Pred Value
## 0.8981233 0.7567829 0.5716724
## Neg Pred Value Precision Recall
## 0.9536020 0.5716724 0.8981233
## F1 Prevalence Detection Rate
## 0.6986444 0.2654804 0.2384342
## Detection Prevalence Balanced Accuracy
## 0.4170819 0.8274531
rpart.plot::rpart.plot(tree_mod, main="Balanced Dataset Decision Tree Model")
Manual preprocessing for keras
sapply(telecom.f,class)
## gender Partner Dependents tenure
## "factor" "factor" "factor" "numeric"
## PhoneService MultipleLines InternetService OnlineSecurity
## "factor" "factor" "factor" "factor"
## OnlineBackup DeviceProtection TechSupport StreamingTV
## "factor" "factor" "factor" "factor"
## StreamingMovies Contract PaperlessBilling PaymentMethod
## "factor" "factor" "factor" "factor"
## MonthlyCharges Churn numTechTickets.bin seniorCitizen.f
## "numeric" "factor" "factor" "factor"
test.dummy <- caret::dummyVars( ~. , telecom.f, sep='_',fullRank=TRUE) #fullRank=TRUE drops one level per factor; Churn becomes a single Churn_Yes column
test.dummy #provides a summary
## Dummy Variable Object
##
## Formula: ~.
## 20 variables, 18 factors
## Variables and levels will be separated by '_'
## A full rank encoding is used
df <- predict(test.dummy,telecom.f)
df <- data.frame(df)
names(df)
## [1] "gender_Male"
## [2] "Partner_Yes"
## [3] "Dependents_Yes"
## [4] "tenure"
## [5] "PhoneService_Yes"
## [6] "MultipleLines_No.phone.service"
## [7] "MultipleLines_Yes"
## [8] "InternetService_Fiber.optic"
## [9] "InternetService_No"
## [10] "OnlineSecurity_No.internet.service"
## [11] "OnlineSecurity_Yes"
## [12] "OnlineBackup_No.internet.service"
## [13] "OnlineBackup_Yes"
## [14] "DeviceProtection_No.internet.service"
## [15] "DeviceProtection_Yes"
## [16] "TechSupport_No.internet.service"
## [17] "TechSupport_Yes"
## [18] "StreamingTV_No.internet.service"
## [19] "StreamingTV_Yes"
## [20] "StreamingMovies_No.internet.service"
## [21] "StreamingMovies_Yes"
## [22] "Contract_One.year"
## [23] "Contract_Two.year"
## [24] "PaperlessBilling_Yes"
## [25] "PaymentMethod_Credit.card..automatic."
## [26] "PaymentMethod_Electronic.check"
## [27] "PaymentMethod_Mailed.check"
## [28] "MonthlyCharges"
## [29] "Churn_Yes"
## [30] "numTechTickets.bin_..3"
## [31] "numTechTickets.bin_3.x..6"
## [32] "numTechTickets.bin_6.x..9"
## [33] "seniorCitizen.f_1"
df_ix <- caret::createDataPartition(df$Churn_Yes, p=.8,list=FALSE,times=1)
#split the encoded data into train and test before scaling
df_train <- df[df_ix,]
df_test <- df[-df_ix,]
#scale the numeric columns, fitting the scaling on each split separately
num.cols <- names(telecom.f)[sapply(telecom.f,is.numeric)]
train.scale <- scale(df_train[num.cols])
test.scale <- scale(df_test[num.cols])
#confirm it worked: each scaled column should have mean ~0 (suffix _1) and variance 1 (suffix _2)
as.data.frame(train.scale) |>
summarize(across(everything(),c(mean,var)))
## tenure_1 tenure_2 MonthlyCharges_1 MonthlyCharges_2
## 1 3.637608e-17 1 -1.868858e-16 1
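Scaling the test set with its own statistics is a simplification; the stricter convention reuses the training means and standard deviations, which scale() stores as attributes. A sketch of that alternative (not applied here; test.scale.strict is a hypothetical name):
#reuse the training center/scale on the test set
test.scale.strict <- scale(df_test[num.cols],
center = attr(train.scale, "scaled:center"),
scale = attr(train.scale, "scaled:scale"))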
#one-hot encode the response for keras; Churn_Yes is dropped from the predictors below
y_train <- to_categorical(df_train$Churn_Yes)
y_test <- to_categorical(df_test$Churn_Yes)
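to_categorical() turns the 0/1 labels into a two-column one-hot matrix; a quick structural check (output not shown):
dim(y_train) #n x 2: column 1 = Churn "No" (label 0), column 2 = "Yes" (label 1)
head(y_train)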
df_train |>
select(-any_of(c(num.cols,'Churn_Yes'))) |>
cbind(train.scale) -> df_train.fin
df_test |>
select(-any_of(c(num.cols,'Churn_Yes'))) |>
cbind(test.scale) -> df_test.fin
#take a look at the final results: all the columns of the data.frame
sort(names(df_train.fin))
## [1] "Contract_One.year"
## [2] "Contract_Two.year"
## [3] "Dependents_Yes"
## [4] "DeviceProtection_No.internet.service"
## [5] "DeviceProtection_Yes"
## [6] "gender_Male"
## [7] "InternetService_Fiber.optic"
## [8] "InternetService_No"
## [9] "MonthlyCharges"
## [10] "MultipleLines_No.phone.service"
## [11] "MultipleLines_Yes"
## [12] "numTechTickets.bin_..3"
## [13] "numTechTickets.bin_3.x..6"
## [14] "numTechTickets.bin_6.x..9"
## [15] "OnlineBackup_No.internet.service"
## [16] "OnlineBackup_Yes"
## [17] "OnlineSecurity_No.internet.service"
## [18] "OnlineSecurity_Yes"
## [19] "PaperlessBilling_Yes"
## [20] "Partner_Yes"
## [21] "PaymentMethod_Credit.card..automatic."
## [22] "PaymentMethod_Electronic.check"
## [23] "PaymentMethod_Mailed.check"
## [24] "PhoneService_Yes"
## [25] "seniorCitizen.f_1"
## [26] "StreamingMovies_No.internet.service"
## [27] "StreamingMovies_Yes"
## [28] "StreamingTV_No.internet.service"
## [29] "StreamingTV_Yes"
## [30] "TechSupport_No.internet.service"
## [31] "TechSupport_Yes"
## [32] "tenure"
Build the neural net model: a sequential, fully connected network with multiple hidden layers, trained by backpropagation. The learning rate follows an exponential decay schedule to help avoid overfitting. The loss is binary crossentropy, which is appropriate for binary classification problems.
mat.train <- as.matrix(df_train.fin)
mat.test <- as.matrix(df_test.fin)
#build model architecture
model <- keras_model_sequential()
model %>%
layer_dense(units = 256, activation = 'relu', input_shape = c(ncol(mat.train)), name ="input_layer") %>%
layer_dropout(rate = 0.4) %>%
layer_dense(units = 128, activation = 'relu', name = 'first_hidden') %>%
layer_dropout(rate = 0.3) %>%
layer_dense(units = 64, activation = 'relu', name = 'second_hidden') %>%
layer_dropout(rate = 0.2) %>%
layer_dense(units = 2, activation = 'sigmoid',name = 'output_layer') #softmax is the more usual choice for a 2-unit output; sigmoid also works with binary crossentropy
Adjusting the learning-rate decay
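The compile and fit calls are not shown in the rendered output; a minimal sketch of what they could look like, assuming optimizer_adam() with keras's learning_rate_schedule_exponential_decay(). The initial rate, decay settings, batch size, and validation split are illustrative assumptions, chosen only to be consistent with the 90 steps/epoch in the log below:
lr_schedule <- learning_rate_schedule_exponential_decay(
initial_learning_rate = 0.001, #assumption
decay_steps = 90, #assumption: decay once per epoch (90 batches)
decay_rate = 0.96 #assumption
)
model %>% compile(
loss = 'binary_crossentropy',
optimizer = optimizer_adam(learning_rate = lr_schedule),
metrics = c('accuracy')
)
history <- model %>% fit(
mat.train, y_train,
epochs = 10,
batch_size = 50, #assumption consistent with 90 steps/epoch
validation_split = 0.2, #assumption: source of val_loss/val_accuracy below
verbose = 2
)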
## Epoch 1/10
## 90/90 - 1s - loss: 0.4155 - accuracy: 0.8062 - val_loss: 0.3503 - val_accuracy: 0.8526 - 662ms/epoch - 7ms/step
## Epoch 2/10
## 90/90 - 0s - loss: 0.3578 - accuracy: 0.8287 - val_loss: 0.3173 - val_accuracy: 0.8597 - 132ms/epoch - 1ms/step
## Epoch 3/10
## 90/90 - 0s - loss: 0.3412 - accuracy: 0.8407 - val_loss: 0.3299 - val_accuracy: 0.8570 - 131ms/epoch - 1ms/step
## Epoch 4/10
## 90/90 - 0s - loss: 0.3375 - accuracy: 0.8442 - val_loss: 0.3271 - val_accuracy: 0.8632 - 132ms/epoch - 1ms/step
## Epoch 5/10
## 90/90 - 0s - loss: 0.3277 - accuracy: 0.8451 - val_loss: 0.3104 - val_accuracy: 0.8597 - 124ms/epoch - 1ms/step
## Epoch 6/10
## 90/90 - 0s - loss: 0.3325 - accuracy: 0.8431 - val_loss: 0.3095 - val_accuracy: 0.8641 - 118ms/epoch - 1ms/step
## Epoch 7/10
## 90/90 - 0s - loss: 0.3304 - accuracy: 0.8462 - val_loss: 0.3079 - val_accuracy: 0.8641 - 115ms/epoch - 1ms/step
## Epoch 8/10
## 90/90 - 0s - loss: 0.3286 - accuracy: 0.8502 - val_loss: 0.3123 - val_accuracy: 0.8659 - 127ms/epoch - 1ms/step
## Epoch 9/10
## 90/90 - 0s - loss: 0.3218 - accuracy: 0.8491 - val_loss: 0.3057 - val_accuracy: 0.8632 - 114ms/epoch - 1ms/step
## Epoch 10/10
## 90/90 - 0s - loss: 0.3288 - accuracy: 0.8502 - val_loss: 0.3282 - val_accuracy: 0.8641 - 133ms/epoch - 1ms/step
Evaluate the final loss and accuracy on the held-out test set
test_results <- model %>% evaluate(mat.test, y_test)
## 44/44 - 0s - loss: 0.3169 - accuracy: 0.8642 - 39ms/epoch - 893us/step
print(test_results)
## loss accuracy
## 0.3169046 0.8641536
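To compare the network with the earlier random-forest results on the same footing, hard class predictions and a confusion matrix can be derived; a sketch (the 0.5 threshold and the helper names are assumptions):
#column 2 of the predicted matrix estimates P(Churn = "Yes")
nn_prob <- model %>% predict(mat.test)
nn_class <- factor(ifelse(nn_prob[, 2] > 0.5, "Yes", "No"), levels = c("No", "Yes"))
truth <- factor(ifelse(df_test$Churn_Yes == 1, "Yes", "No"), levels = c("No", "Yes"))
confusionMatrix(nn_class, truth, positive = "Yes")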