The objective of this case study is to predict customer churn and to understand the factors driving customer attrition.
The dataset has been provided by Analytixlabs and is for academic purposes only.
Packages used: dplyr, ggplot2, Hmisc, comprehenr, psych, corrplot, ROCR
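The corresponding library calls (corrplot is included because it is used for the correlation plot later):
library(dplyr)       # data manipulation
library(ggplot2)     # plotting
library(Hmisc)       # impute()
library(comprehenr)  # to_vec() comprehensions
library(psych)       # fa() for factor analysis
library(corrplot)    # correlation heatmap
library(ROCR)        # ROC curve and AUC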
# Importing the data
data = read.csv("D:\\Business Analytics\\R\\CLASS 13 - 14\\Proactive Attrition Management-Logistic Regression Case Study\\Proactive Attrition Management-Logistic Regression Case Study.csv")
dim(data)
## [1] 71047 78
By looking at the data we can conclude that two variables, CSA and CUSTOMER, do not add any value, so we will remove them.
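The removal itself is not shown; a minimal equivalent:
# Drop the two identifier-style columns
data$CSA = NULL
data$CUSTOMER = NULL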
sum(duplicated(data))
## [1] 0
# Categorical Features
categorical_feature = to_vec(for(feature in colnames(data)) if(length(unique(data[[feature]]))<26) feature)
for(feature in categorical_feature) {cat(feature,"-",length(unique(data[[feature]])),", ")}
## CHURN - 2 , UNIQSUBS - 15 , ACTVSUBS - 12 , PHONES - 25 , MODELS - 15 , CHILDREN - 2 , CREDITA - 2 , CREDITAA - 2 , CREDITB - 2 , CREDITC - 2 , CREDITDE - 2 , CREDITGY - 2 , CREDITZ - 2 , PRIZMRUR - 2 , PRIZMUB - 2 , PRIZMTWN - 2 , REFURB - 2 , WEBCAP - 2 , TRUCK - 2 , RV - 2 , OCCPROF - 2 , OCCCLER - 2 , OCCCRFT - 2 , OCCSTUD - 2 , OCCHMKR - 2 , OCCRET - 2 , OCCSELF - 2 , OWNRENT - 2 , MARRYUN - 2 , MARRYYES - 2 , MARRYNO - 2 , MAILORD - 2 , MAILRES - 2 , MAILFLAG - 2 , TRAVEL - 2 , PCOWN - 2 , CREDITCD - 2 , RETCALLS - 5 , RETACCPT - 5 , NEWCELLY - 2 , NEWCELLN - 2 , REFER - 13 , INCMISS - 2 , INCOME - 10 , MCYCLE - 2 , CREDITAD - 16 , SETPRCM - 2 , SETPRC - 16 , RETCALL - 2 , CALIBRAT - 2 , CHURNDEP - 3 ,
There are a few features, such as PHONES, CREDITAD and SETPRC, which have more than fifteen categories.
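The derivation of the continuous feature list is not shown in the original; a minimal reconstruction, assuming the continuous features are simply the non-categorical columns:
# Continuous Features: everything not flagged as categorical
continuous_features = setdiff(colnames(data), categorical_feature)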
# Missing values in Continuous Features
for(feature in continuous_features) {cat(feature,"-",sum(is.na(data[[feature]])),", ")}
## REVENUE - 216 , MOU - 216 , RECCHRGE - 216 , DIRECTAS - 216 , OVERAGE - 216 , ROAM - 216 , CHANGEM - 502 , CHANGER - 502 , DROPVCE - 0 , BLCKVCE - 0 , UNANSVCE - 0 , CUSTCARE - 0 , THREEWAY - 0 , MOUREC - 0 , OUTCALLS - 0 , INCALLS - 0 , PEAKVCE - 0 , OPEAKVCE - 0 , DROPBLK - 0 , CALLFWDV - 0 , CALLWAIT - 0 , MONTHS - 0 , EQPDAYS - 1 , AGE1 - 1244 , AGE2 - 1244 ,
There are a few features with missing values, so we will replace them with something meaningful.
# Missing values in Categorical Features
for(feature in categorical_feature) {cat(feature,"-",sum(is.na(data[[feature]])),", ")}
## CHURN - 0 , UNIQSUBS - 0 , ACTVSUBS - 0 , PHONES - 1 , MODELS - 1 , CHILDREN - 0 , CREDITA - 0 , CREDITAA - 0 , CREDITB - 0 , CREDITC - 0 , CREDITDE - 0 , CREDITGY - 0 , CREDITZ - 0 , PRIZMRUR - 0 , PRIZMUB - 0 , PRIZMTWN - 0 , REFURB - 0 , WEBCAP - 0 , TRUCK - 0 , RV - 0 , OCCPROF - 0 , OCCCLER - 0 , OCCCRFT - 0 , OCCSTUD - 0 , OCCHMKR - 0 , OCCRET - 0 , OCCSELF - 0 , OWNRENT - 0 , MARRYUN - 0 , MARRYYES - 0 , MARRYNO - 0 , MAILORD - 0 , MAILRES - 0 , MAILFLAG - 0 , TRAVEL - 0 , PCOWN - 0 , CREDITCD - 0 , RETCALLS - 0 , RETACCPT - 0 , NEWCELLY - 0 , NEWCELLN - 0 , REFER - 0 , INCMISS - 0 , INCOME - 0 , MCYCLE - 0 , CREDITAD - 0 , SETPRCM - 0 , SETPRC - 0 , RETCALL - 0 , CALIBRAT - 0 , CHURNDEP - 31047 ,
We can observe that PHONES and MODELS have one missing value each, so we will replace them with something meaningful. CHURNDEP is missing for 31,047 records by design, since it is only populated for the calibration sample (CALIBRAT = 1).
Here is some important summary information about the continuous features.
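The summary function behind the table is not shown in the original; a minimal sketch that reproduces its columns (judging from the figures, uc2/lc2 are mean +/- 3*sd and uc1/lc1 are the 99th/1st percentiles):
# Sketch of a descriptive-stats UDF for the continuous features
cont_summary = function(x) {
  n = sum(!is.na(x))
  m = mean(x, na.rm = TRUE)
  s = sd(x, na.rm = TRUE)
  p = quantile(x, c(0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99), na.rm = TRUE)
  c(nmiss = sum(is.na(x)), no_of_0 = sum(x == 0, na.rm = TRUE), n = n,
    outlier_flag1 = as.numeric(max(x, na.rm = TRUE) > p[7]),      # assumed flag definition
    outlier_flag2 = as.numeric(max(x, na.rm = TRUE) > m + 3*s),   # assumed flag definition
    mean = m, std = s, cv = s/m, min = min(x, na.rm = TRUE),
    p1 = p[1], p5 = p[2], q1 = p[3], q2 = p[4], q3 = p[5], p95 = p[6], p99 = p[7],
    max = max(x, na.rm = TRUE),
    uc1 = p[7], lc1 = p[1], uc2 = m + 3*s, lc2 = m - 3*s)
}
round(t(apply(data[continuous_features], 2, cont_summary)), 4)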
## nmiss no_of_0 n outlier_flag1.99% outlier_flag2 mean
## REVENUE 216 8 70831 1 1 58.8539614
## MOU 216 884 70831 1 1 525.7283924
## RECCHRGE 216 209 70831 1 1 46.8764916
## DIRECTAS 216 34105 70831 1 1 0.8948011
## OVERAGE 216 30387 70831 1 1 40.0953598
## ROAM 216 48584 70831 1 1 1.2215262
## std cv min p1.1% p5.5% q1.25% q2.50% q3.75% p95.95%
## REVENUE 44.243613 0.7517525 -6.17 10.00 15.515 33.64 48.53 71.03 135.390
## MOU 530.134259 1.0083805 0.00 0.00 20.415 158.25 366.00 721.75 1580.250
## RECCHRGE 23.915103 0.5101726 -11.29 9.19 10.000 30.00 44.99 59.99 85.000
## DIRECTAS 2.197815 2.4562047 0.00 0.00 0.000 0.00 0.25 0.99 4.210
## OVERAGE 96.347103 2.4029490 0.00 0.00 0.000 0.00 2.50 40.75 190.375
## ROAM 9.081196 7.4343034 0.00 0.00 0.000 0.00 0.00 0.26 5.090
## p99.99% max uc1.99% lc1.1% uc2 lc2
## REVENUE 225.512 1223.38 225.512 10.00 191.584801 -73.876878
## MOU 2450.125 7667.75 2450.125 0.00 2116.131170 -1064.674385
## RECCHRGE 119.990 399.99 119.990 9.19 118.621801 -24.868817
## DIRECTAS 9.650 159.39 9.650 0.00 7.488245 -5.698643
## OVERAGE 427.675 4320.75 427.675 0.00 329.136670 -248.945950
## ROAM 21.557 1112.45 21.557 0.00 28.465115 -26.022062
par(mfrow=c(7,4))
#col_names = colnames(data[continuous_features])
for(feature in continuous_features){
hist(data[[feature]], main = paste("Histogram of",feature),xlab = feature, ylab = "COUNT", col = "aquamarine3",border = "black")
}
From the above histograms we can conclude that all the features are skewed, so we may have to transform them to bring them closer to a normal distribution.
par(mfrow=c(7,4))
col_names = colnames(data[continuous_features])
for(feature in continuous_features){
par(bg="aliceblue")
boxplot(data[feature],ylab=feature, col = "lightblue")
}
From the above boxplots we can conclude that the majority of the continuous features have outliers.
These are the features with missing values:
# Feature with missing
feature_with_na = to_vec(for(feature in colnames(data)) if(sum(is.na(data[[feature]]))>0) feature)
feature_with_na
## [1] "REVENUE" "MOU" "RECCHRGE" "DIRECTAS" "OVERAGE" "ROAM"
## [7] "CHANGEM" "CHANGER" "PHONES" "MODELS" "EQPDAYS" "AGE1"
## [13] "AGE2" "CHURNDEP"
We will impute the missing values with the median.
#--Treating Missing Values: replace the NAs in each affected feature with that feature's median
data[feature_with_na] = data.frame(apply(data[feature_with_na],2, function(x) impute(x,median)))
There are two features, SETPRC and INCOME, in which 0 represents a missing value. INCOME has 24% missing whereas SETPRC has 56% missing, so we will drop SETPRC and impute the missing INCOME values with the median.
data$SETPRC = NULL
data$INCOME[data$INCOME==0] = median(data$INCOME[data$INCOME!=0]) # median of the non-missing (non-zero) incomes
This is a user-defined function for capping the outliers.
# Treating Outliers: cap each feature at its 1st and 99th percentiles
otl_udf = function(x) {
  quantiles = quantile(x, c(0.01,0.99), na.rm = T)
  x[x < quantiles[1]] = quantiles[1]  # floor values below the 1st percentile
  x[x > quantiles[2]] = quantiles[2]  # cap values above the 99th percentile
  x
}
We have capped the outliers at the 1st and 99th percentiles.
data[continuous_features] = apply(data[continuous_features],2,otl_udf)
cor_mat=round(cor(data[continuous_features]),2)
col <- colorRampPalette(c("darkred","white","darkgreen"))
corrplot(cor_mat,method = "color",col = col(200),tl.cex = 0.7,tl.col="black", addCoef.col = "black", number.cex = 0.7)
From the above correlation matrix we can observe that a few features exhibit high multicollinearity, so we will remove those features; one way to list the offending pairs is sketched below.
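An illustrative sketch for listing the highly correlated pairs (the 0.8 threshold is an assumption):
# Feature pairs with |correlation| above 0.8
high_cor = which(abs(cor_mat) > 0.8 & upper.tri(cor_mat), arr.ind = TRUE)
data.frame(f1 = rownames(cor_mat)[high_cor[, 1]],
           f2 = colnames(cor_mat)[high_cor[, 2]],
           r  = cor_mat[high_cor])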
In factor analysis we will group the variables into factors and then select a few variables from each factor, roughly in proportion to the number of variables in each factor.
cormat_telco = cor(data)
# Deciding number of factor using scree plot
head(eigen(cormat_telco)$values,12)
## [1] 10.229752 6.206447 3.095276 2.482243 2.182646 2.023611 1.826935
## [8] 1.734867 1.603652 1.540323 1.497265 1.331161
Here we have calculated the correlation matrix and then computed the eigenvalues of the features. These are the first twelve eigenvalues.
eigen_values_telco = mutate(data.frame(eigen(cormat_telco)$values),
cum_sum_eigen = cumsum(eigen.cormat_telco..values),
pct_var = eigen.cormat_telco..values/sum(eigen.cormat_telco..values),
cum_pct_var = cum_sum_eigen/sum(eigen.cormat_telco..values))
Now we have the eigenvalues, their cumulative sum, the percent variation and the cumulative percent variation. The purpose of calculating these is to understand how many factors we should consider.
head(eigen_values_telco)
## eigen.cormat_telco..values cum_sum_eigen pct_var cum_pct_var
## 1 10.229752 10.22975 0.13639670 0.1363967
## 2 6.206447 16.43620 0.08275263 0.2191493
## 3 3.095276 19.53148 0.04127034 0.2604197
## 4 2.482243 22.01372 0.03309658 0.2935162
## 5 2.182646 24.19636 0.02910195 0.3226182
## 6 2.023611 26.21998 0.02698148 0.3495997
plot(eigen_values_telco$pct_var,type="b")
Here we have plotted the percent variation explained at each number of factors (a scree plot) to help decide how many factors to retain.
factor_analysis_telco = fa(r=cormat_telco,28,rotate = "varimax" ,fm = "pca")
## factor method not specified correctly, minimum residual (unweighted least squares used
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In factor.stats, the correlation matrix is singular, an approximation is used
## In factor.scores, the correlation matrix is singular, an approximation is used
## I was unable to calculate the factor score weights, factor loadings used instead
fa_sort_telco = fa.sort(factor_analysis_telco)
loadings = data.frame(fa_sort_telco$loadings[1:ncol(data),])
Since "pca" is not a valid fm option for psych::fa(), the function fell back to the minimum-residual method, as the first warning above indicates. We have then sorted the loadings at each factor level.
#write.csv(loadings,"loadings_telco.csv",row.names = T)
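One illustrative way to see how the variables distribute across factors before picking representatives (the original selection was done by inspecting the exported loadings):
# Map each variable to the factor with its highest absolute loading (illustrative)
assigned_factor = apply(abs(loadings), 1, which.max)
table(assigned_factor) # how many variables load on each factor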
These are the features selected from the different factors.
vars = c('PRIZMRUR','MARRYYES','OCCRET','OCCPROF','OVERAGE','CREDITAA','NEWCELLY',
         'CREDITAD','CREDITDE','CREDITC','OCCSELF','TRAVEL','PCOWN','PRIZMTWN',
         'CREDITB','BLCKVCE','CHANGEM','ROAM','REVENUE','MAILRES','MARRYNO',
         'TRUCK','ACTVSUBS','WEBCAP','CREDITA','MONTHS','RETACCPT','RETCALLS',
         'REFURB','PHONES','CREDITGY','CHILDREN','AGE1','INCOME','CREDITCD',
         'OWNRENT','CALLFWDV','REFER','DIRECTAS','THREEWAY','CUSTCARE','DROPVCE',
         'UNANSVCE','OUTCALLS','MOU','OPEAKVCE')
x = data[vars]
y = subset(data, select = c(CHURN,CALIBRAT))
# standardize the predictors, then reattach the target and the calibration flag
data = cbind(as.data.frame(scale(x, center = TRUE, scale = TRUE)), as.data.frame(y))
Now we will split the data into train and test sets using the CALIBRAT flag, train the model on the calibration data, and validate it on the test data.
#--Train and Test dataset
train = data[data$CALIBRAT==1,]#calibration
test = data[!data$CALIBRAT==1,]#validation
We will create a model with all the features; a sketch of this full-model fit follows.
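The full-model fit itself is not shown; a minimal reconstruction consistent with the stepwise call that follows (excluding the constant CALIBRAT flag from the predictors is an assumption):
# Full model on all selected features (CALIBRAT is constant within train, so it is excluded)
fit = glm(CHURN ~ . - CALIBRAT, data = train, family = binomial(logit))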
step_fit = step(fit, direction = "both") # stepwise selection in both directions
We will then create a model with the features selected by stepwise AIC.
fit2 = glm(CHURN ~ OVERAGE + NEWCELLY + CREDITAD +
CREDITDE + CREDITC + BLCKVCE + CHANGEM +
ROAM + REVENUE + MAILRES + MARRYNO + WEBCAP + MONTHS +
RETACCPT + RETCALLS + REFURB + PHONES + CHILDREN + AGE1 +
INCOME + CREDITCD + THREEWAY + CUSTCARE + DROPVCE + MOU,
data = train, family = binomial(logit))
summary(fit2)
##
## Call:
## glm(formula = CHURN ~ OVERAGE + NEWCELLY + CREDITAD + CREDITDE +
## CREDITC + BLCKVCE + CHANGEM + ROAM + REVENUE + MAILRES +
## MARRYNO + WEBCAP + MONTHS + RETACCPT + RETCALLS + REFURB +
## PHONES + CHILDREN + AGE1 + INCOME + CREDITCD + THREEWAY +
## CUSTCARE + DROPVCE + MOU, family = binomial(logit), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.60709 -1.15321 -0.07534 1.15949 2.16502
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.02523 0.01019 -2.477 0.01323 *
## OVERAGE 0.19511 0.01692 11.532 < 0.0000000000000002 ***
## NEWCELLY -0.02946 0.01025 -2.875 0.00404 **
## CREDITAD -0.02984 0.01183 -2.522 0.01167 *
## CREDITDE -0.12251 0.01118 -10.954 < 0.0000000000000002 ***
## CREDITC -0.05700 0.01052 -5.421 0.000000059316011016 ***
## BLCKVCE 0.02935 0.01135 2.585 0.00973 **
## CHANGEM -0.08308 0.01018 -8.160 0.000000000000000335 ***
## ROAM 0.07176 0.01082 6.632 0.000000000033058400 ***
## REVENUE -0.05712 0.02059 -2.775 0.00553 **
## MAILRES -0.06431 0.01267 -5.075 0.000000388408700425 ***
## MARRYNO -0.02806 0.01101 -2.548 0.01083 *
## WEBCAP -0.10130 0.01058 -9.577 < 0.0000000000000002 ***
## MONTHS 0.03409 0.01250 2.726 0.00640 **
## RETACCPT -0.04154 0.01477 -2.813 0.00491 **
## RETCALLS 0.16636 0.01487 11.188 < 0.0000000000000002 ***
## REFURB 0.07618 0.01050 7.255 0.000000000000400814 ***
## PHONES -0.07753 0.01290 -6.009 0.000000001871971870 ***
## CHILDREN 0.03396 0.01135 2.991 0.00278 **
## AGE1 -0.09956 0.01453 -6.851 0.000000000007338710 ***
## INCOME -0.02720 0.01100 -2.474 0.01337 *
## CREDITCD 0.02615 0.01464 1.786 0.07408 .
## THREEWAY -0.03832 0.01188 -3.225 0.00126 **
## CUSTCARE -0.02484 0.01243 -1.998 0.04575 *
## DROPVCE 0.07517 0.01362 5.517 0.000000034411711609 ***
## MOU -0.20553 0.01870 -10.988 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 55452 on 39999 degrees of freedom
## Residual deviance: 54235 on 39974 degrees of freedom
## AIC: 54287
##
## Number of Fisher Scoring iterations: 4
#--validation on the training (calibration) data
train1 = cbind(train,PROB = predict(fit2,type = "response"))
#--creating decile analysis
decLocation = quantile(train1$PROB, probs = seq(0.1,0.9,by = 0.1))
train1$decile = findInterval(train1$PROB, c(-Inf, decLocation,Inf))
#--decile analysis report
decile_summ_train=train1%>%
group_by(decile)%>%
summarise(total_cnt = n(),min_prob = min(PROB), max_prob = max(PROB),
churn_cnt = sum(CHURN), nonchurn_cnt = total_cnt - churn_cnt)
decile_summ_train = arrange(decile_summ_train,desc(decile))
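The report below also contains derived columns (bad_rate, cumm_bad, good_rate, cumm_good, ks, random_model, lift, baseline) whose computation is not shown; a sketch with assumed definitions, chosen to match the printed figures (the same step applies to the test report further down):
decile_summ_train = decile_summ_train %>%
  mutate(bad_rate     = churn_cnt * 100 / sum(churn_cnt),       # % of all churners captured in the decile
         cumm_bad     = cumsum(bad_rate),
         good_rate    = nonchurn_cnt * 100 / sum(nonchurn_cnt),
         cumm_good    = cumsum(good_rate),
         ks           = abs(cumm_bad - cumm_good),              # separation between churner and non-churner capture
         random_model = seq(10, 100, by = 10),                  # cumulative % expected by chance (assumed)
         lift         = cumm_bad / random_model,                # assumed definition
         baseline     = 10)                                     # assumed: expected % of churners per decile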
## # A tibble: 10 x 14
## decile total_cnt min_prob max_prob churn_cnt nonchurn_cnt bad_rate cumm_bad
## <int> <int> <dbl> <dbl> <int> <int> <dbl> <dbl>
## 1 10 4000 0.605 0.967 2664 1336 13.3 13.3
## 2 9 4000 0.562 0.605 2427 1573 12.1 25.4
## 3 8 4000 0.537 0.562 2202 1798 11.0 36.5
## 4 7 4000 0.515 0.537 2153 1847 10.8 47.2
## 5 6 4000 0.498 0.515 1981 2019 9.9 57.1
## 6 5 4000 0.481 0.498 1937 2063 9.69 66.8
## 7 4 4000 0.462 0.481 1864 2136 9.32 76.1
## 8 3 4000 0.437 0.462 1730 2270 8.65 84.8
## 9 2 4000 0.397 0.437 1624 2376 8.12 92.9
## 10 1 4000 0.0960 0.397 1418 2582 7.09 100.
## # ... with 6 more variables: good_rate <dbl>, cumm_good <dbl>, ks <dbl>,
## # random_model <dbl>, lift <dbl>, baseline <dbl>
#--validation on the test (holdout) data
test1 = cbind(test,PROB = predict(fit2,test,type = "response"))
#--creating decile analysis
decLocation_t = quantile(test1$PROB, probs = seq(0.1,0.9,by = 0.1))
test1$decile = findInterval(test1$PROB, c(-Inf, decLocation_t,Inf))
#--decile analysis report
decile_summ_test=test1%>%
group_by(decile)%>%
summarise(total_cnt = n(),min_prob = min(PROB), max_prob = max(PROB),
churn_cnt = sum(CHURN), nonchurn_cnt = total_cnt - churn_cnt)
decile_summ_test = arrange(decile_summ_test,desc(decile))
## # A tibble: 10 x 14
## decile total_cnt min_prob max_prob churn_cnt nonchurn_cnt bad_rate cumm_bad
## <int> <int> <dbl> <dbl> <int> <int> <dbl> <dbl>
## 1 10 3105 0.585 0.886 93 3012 15.3 15.3
## 2 9 3105 0.546 0.585 97 3008 15.9 31.2
## 3 8 3104 0.522 0.546 78 3026 12.8 44.0
## 4 7 3105 0.503 0.522 72 3033 11.8 55.8
## 5 6 3105 0.486 0.503 58 3047 9.52 65.4
## 6 5 3104 0.469 0.486 55 3049 9.03 74.4
## 7 4 3105 0.450 0.469 48 3057 7.88 82.3
## 8 3 3104 0.424 0.450 44 3060 7.22 89.5
## 9 2 3105 0.383 0.424 38 3067 6.24 95.7
## 10 1 3105 0.0936 0.383 26 3079 4.27 100.
## # ... with 6 more variables: good_rate <dbl>, cumm_good <dbl>, ks <dbl>,
## # random_model <dbl>, lift <dbl>, baseline <dbl>
# Confusion matrix
table(train1$PROB>0.51, train1$CHURN) #training(calibration)
##
## 0 1
## FALSE 12910 9933
## TRUE 7090 10067
table(test1$PROB>0.51, test1$CHURN) #testing(validation)
##
## 0 1
## FALSE 19488 291
## TRUE 10950 318
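From the confusion matrices, overall accuracy at the 0.51 cutoff can be computed directly (values not shown in the original):
# Overall accuracy at the 0.51 cutoff
mean((train1$PROB > 0.51) == train1$CHURN)
mean((test1$PROB > 0.51) == test1$CHURN)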
pred_train_fit2 = prediction(train1$PROB,train1$CHURN)
perf_fit2 = performance(pred_train_fit2,"tpr","fpr")
plot(perf_fit2)
abline(0,1)
performance(pred_train_fit2,"auc")@y.values
## [[1]]
## [1] 0.6018469
pred_test_fit2 = prediction(test1$PROB,test1$CHURN)
perf_fit2 = performance(pred_test_fit2,"tpr","fpr")
plot(perf_fit2)
abline(0,1)
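The test-set AUC can be computed the same way as for training (its value is not shown in the original):
performance(pred_test_fit2,"auc")@y.values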