This data challenge is a part of case study done for an insurance company. The data consisted of two sets: 2017 policies data and fresh 2018 data. Both the sets were at the individual quote level and contained details about the customer demographic, vehicle, traffic condition and the claims made in the past. It had 60392 observations and 9 variables. The objectives of the challenge were as follows:
The objectives are centered at marketing campaigns engaging future 2018 customers based on their risk profiles.
library(readr)
library(ggplot2)
library(lubridate)
library(dplyr)
library(VIM)
library(mice)
library(corrplot)
library(randomForest)
library(Matrix)
library(ROCR)
library(xgboost)
library(caret)
library(car)
library(cluster)
library(Rtsne)
library(RODBC)
library(dplyr)
library(kableExtra)
library(Metrics) # For calculating auc
library(GGally)
raw_data_2017 <- suppressMessages(as.data.frame(read_csv("./auto_policies_2017.csv")))
numclaims<-raw_data_2017$numclaims
claimcst0<-raw_data_2017$claimcst0
raw_data_2018 <- suppressMessages(as.data.frame(read_csv("./auto_potential_customers_2018.csv")))
quote_number<-raw_data_2018$quote_number
The data had many missing values for:
Proportion of missing values for each feature and combination of features was best understood from the plot below:
| pol_number | pol_eff_dt | gender | agecat | date_of_birth | credit_score | area | traffic_index | veh_age | veh_body | veh_value | numclaims | claimcst0 | annual_premium | age |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 64080188 | 7/10/2017 | M | 2 | 1989-01-04 | 631 | B | 140.9 | 4 | TRUCK | 0.924 | 0 | 0.0000 | 716.53 | 29 |
| 18917133 | 7/31/2017 | M | 2 | 1985-06-21 | 531 | C | 136.5 | 3 | HBACK | 1.430 | 1 | 583.0109 | 716.53 | 33 |
| 82742606 | 2/1/2017 | M | 6 | 1942-07-25 | 838 | D | 88.8 | 3 | SEDAN | 1.100 | 1 | 159.3758 | 716.53 | 76 |
| 43601997 | 10/17/2017 | M | 5 | 1959-06-08 | 835 | E | NA | 2 | SEDAN | 2.090 | 0 | 0.0000 | 716.53 | 59 |
| 58746861 | 4/13/2017 | F | 4 | 1967-05-16 | 748 | C | 123.0 | 3 | HBACK | 0.803 | 1 | 143.5556 | 716.53 | 51 |
| 83346346 | 11/23/2017 | M | 5 | 1956-11-02 | 785 | B | 108.6 | 2 | SEDAN | 1.903 | 0 | 0.0000 | 716.53 | 61 |
| 92111059 | 11/29/2017 | M | 3 | 1976-10-31 | 759 | E | 75.0 | 4 | STNWG | 1.452 | 0 | 0.0000 | 716.53 | 41 |
| 69967688 | 4/24/2017 | F | NA | 1947-11-27 | 836 | C | 88.5 | 1 | HBACK | 1.397 | 0 | 0.0000 | 716.53 | 70 |
| 58856161 | 7/19/2017 | M | NA | 1967-03-12 | 688 | A | 50.0 | 1 | SEDAN | 2.838 | 0 | 0.0000 | 716.53 | 51 |
| 57786319 | 8/31/2017 | M | NA | 1984-12-25 | 503 | B | NA | 3 | HBACK | 1.936 | 1 | 1039.3983 | 716.53 | 33 |
| 42491309 | 3/24/2017 | F | 3 | 1974-10-08 | 830 | D | 132.2 | 4 | STNWG | 1.485 | 0 | 0.0000 | 716.53 | 43 |
| 66944935 | 7/20/2017 | F | 2 | 1982-08-31 | 649 | E | 27.0 | 2 | STNWG | 3.641 | 0 | 0.0000 | 716.53 | 35 |
| 66147210 | 5/24/2017 | M | 4 | 1968-03-30 | 424 | D | 84.2 | 4 | TRUCK | 1.595 | 1 | 5087.3893 | 716.53 | 50 |
| 73105575 | 9/1/2017 | M | NA | 1969-08-28 | 792 | A | 85.4 | 1 | HBACK | 1.518 | 0 | 0.0000 | 716.53 | 48 |
| 51922471 | 7/4/2017 | F | NA | 1975-10-06 | 652 | A | 77.7 | 4 | HBACK | 0.495 | 0 | 0.0000 | 716.53 | 42 |
| 61781876 | 5/11/2017 | F | 3 | 1970-01-27 | 843 | D | 88.0 | 3 | SEDAN | 1.078 | 0 | 0.0000 | 716.53 | 48 |
| 72643150 | 3/20/2017 | F | 3 | 1970-11-13 | 660 | C | 148.5 | 1 | STNWG | 5.566 | 0 | 0.0000 | 716.53 | 47 |
| 29718559 | 10/11/2017 | F | 1 | 1990-11-04 | 546 | D | 41.5 | 2 | SEDAN | 1.716 | 1 | 5096.5742 | 716.53 | 27 |
| 42078978 | 11/6/2017 | M | 4 | 1968-11-19 | 775 | B | 132.5 | 4 | PANVN | 0.495 | 0 | 0.0000 | 716.53 | 49 |
| 35887851 | 1/9/2017 | F | 3 | 1976-11-18 | 661 | A | 84.3 | 3 | SEDAN | 1.232 | 0 | 0.0000 | 716.53 | 41 |
Missing values for variable agecat, credit_score and traffic_index were imputed in this section. agecat for missing values was calculated using date_of_birth and continuous variables: credit_score and traffic_index were imputed using MICE package. The technique used for their imputation was: Predictive Mean Matching (pmm). The data was found to be missing randomly (ideal scenario). For example, in the plot below; red left vertical boxplot shows the distribution of credit_score where traffic_index is missing and blue left vertical boxplot shows where both are present. Same analogy for the bottom horizontal boxplots.
For imputing the missing data in both the 2017 (training) & 2018 (testing) datasets, 2018 data was first appended to 2017 data and then the missing value was imputed for NAs in both the sets.
Below plot shows that all the missing values have been immputed now.
# Training Dataset
raw_data_2017<-raw_data_2017 %>%
mutate(agecat = ifelse(is.na(agecat),
ifelse(18<=age & age<28,1,
ifelse(28<=age & age<38,2,
ifelse(38<=age & age<48,3,
ifelse(48<=age & age<58,4,
ifelse(58<=age & age<68,5,6))))),agecat))
# Testing Dataset
raw_data_2018<-raw_data_2018 %>%
mutate(agecat = ifelse(is.na(agecat),
ifelse(18<=age & age<28,1,
ifelse(28<=age & age<38,2,
ifelse(38<=age & age<48,3,
ifelse(48<=age & age<58,4,
ifelse(58<=age & age<68,5,6))))),agecat))
# Keeping only relevant columns ftr the imputing process
raw_data_2017<-raw_data_2017[,c("gender","age","agecat","credit_score","area","traffic_index","veh_age","veh_body","veh_value")]
raw_data_2018<-raw_data_2018[,c("gender","age","agecat","credit_score","area","traffic_index","veh_age","veh_body","veh_value")]
# Correcting Data Types
raw_data_2017$gender<-as.factor(raw_data_2017$gender)
raw_data_2017$agecat<-as.factor(raw_data_2017$agecat)
raw_data_2017$area<-as.factor(raw_data_2017$area)
raw_data_2017$veh_body<-as.factor(raw_data_2017$veh_body)
raw_data_2018$gender<-as.factor(raw_data_2018$gender)
raw_data_2018$agecat<-as.factor(raw_data_2018$agecat)
raw_data_2018$area<-as.factor(raw_data_2018$area)
raw_data_2018$veh_body<-as.factor(raw_data_2018$veh_body)
# Imputing the missing data using MICE package
combined_data<-rbind(raw_data_2017,raw_data_2018)
init = mice(combined_data, maxit=0)
meth = init$method
meth[c("gender")]=""
meth[c("age")]=""
meth[c("agecat")]=""
meth[c("area")]=""
meth[c("veh_age")]=""
meth[c("veh_body")]=""
meth[c("veh_value")]=""
meth[c("credit_score")]="pmm"
meth[c("traffic_index")]="pmm"
imputed = mice(combined_data, method=meth, m=1,maxit = 5,seed = 500)
imputed <- complete(imputed)
aggr_plot <- aggr(imputed, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE,
labels=names(imputed), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
Next, a binary response variable was created in 2017 data as claim_YN based on column claimcst0 (1 if some amount was claimed 0 if no amount was claimed).
index<-1:60392
imp_2017<-imputed[index,]
imp_2018<-imputed[-index,]
imp_2017<-cbind(imp_2017,numclaims,claimcst0)
imp_2017$claim_YN<-as.factor(ifelse(claimcst0>0,1,0)) # Response needs to be a category
ASSUMPTION: Some vehicles had veh_value as zeroes, which doesn’t sound practical. So below code changes the 0 vehicle values to max of their vehicle_age + vehicle_body category combination. Maximum because generally more risk is associated with higher valued vehicles and in this challenge one of our main task is to identify risky policies.
# Changing vehicles with 0 value to max of their vehicle_age + vehicle_body category
imp_2017<-imp_2017 %>%
mutate(veh_value = ifelse(veh_value==0,
ifelse(veh_body=="BUS",7.1830,
ifelse(veh_body=="MCARA",13.6400,
ifelse(veh_body=="MIBUS",5.0160,
ifelse(veh_body=="SEDAN",24.9590,
ifelse(veh_body=="STNWG",12.5400,6.2700))))),veh_value))
imp_2018<-imp_2018 %>%
mutate(veh_value = ifelse(veh_value==0,
ifelse(veh_body=="BUS",7.1830,
ifelse(veh_body=="MCARA",13.6400,
ifelse(veh_body=="MIBUS",5.0160,
ifelse(veh_body=="SEDAN",24.9590,
ifelse(veh_body=="STNWG",12.5400,6.2700))))),veh_value))
Preview after cleaning and imputation:
| gender | age | agecat | credit_score | area | traffic_index | veh_age | veh_body | veh_value | numclaims | claimcst0 | claim_YN |
|---|---|---|---|---|---|---|---|---|---|---|---|
| M | 29 | 2 | 631 | B | 140.9 | 4 | TRUCK | 0.924 | 0 | 0.0000 | 0 |
| M | 33 | 2 | 531 | C | 136.5 | 3 | HBACK | 1.430 | 1 | 583.0109 | 1 |
| M | 76 | 6 | 838 | D | 88.8 | 3 | SEDAN | 1.100 | 1 | 159.3758 | 1 |
| M | 59 | 5 | 835 | E | 28.5 | 2 | SEDAN | 2.090 | 0 | 0.0000 | 0 |
| F | 51 | 4 | 748 | C | 123.0 | 3 | HBACK | 0.803 | 1 | 143.5556 | 1 |
| M | 61 | 5 | 785 | B | 108.6 | 2 | SEDAN | 1.903 | 0 | 0.0000 | 0 |
| M | 41 | 3 | 759 | E | 75.0 | 4 | STNWG | 1.452 | 0 | 0.0000 | 0 |
| F | 70 | 6 | 836 | C | 88.5 | 1 | HBACK | 1.397 | 0 | 0.0000 | 0 |
| M | 51 | 4 | 688 | A | 50.0 | 1 | SEDAN | 2.838 | 0 | 0.0000 | 0 |
| M | 33 | 2 | 503 | B | 134.1 | 3 | HBACK | 1.936 | 1 | 1039.3983 | 1 |
| F | 43 | 3 | 830 | D | 132.2 | 4 | STNWG | 1.485 | 0 | 0.0000 | 0 |
| F | 35 | 2 | 649 | E | 27.0 | 2 | STNWG | 3.641 | 0 | 0.0000 | 0 |
| M | 50 | 4 | 424 | D | 84.2 | 4 | TRUCK | 1.595 | 1 | 5087.3893 | 1 |
| M | 48 | 4 | 792 | A | 85.4 | 1 | HBACK | 1.518 | 0 | 0.0000 | 0 |
| F | 42 | 3 | 652 | A | 77.7 | 4 | HBACK | 0.495 | 0 | 0.0000 | 0 |
| F | 48 | 3 | 843 | D | 88.0 | 3 | SEDAN | 1.078 | 0 | 0.0000 | 0 |
| F | 47 | 3 | 660 | C | 148.5 | 1 | STNWG | 5.566 | 0 | 0.0000 | 0 |
| F | 27 | 1 | 546 | D | 41.5 | 2 | SEDAN | 1.716 | 1 | 5096.5742 | 1 |
| M | 49 | 4 | 775 | B | 132.5 | 4 | PANVN | 0.495 | 0 | 0.0000 | 0 |
| F | 41 | 3 | 661 | A | 84.3 | 3 | SEDAN | 1.232 | 0 | 0.0000 | 0 |
Below plots were generated to understand the pattern of claims across variables.
Following conclusions were drawn from the exploratory data analysis:
From the correlation matrix and jitter plot below we can say that:
veh_value and veh_age are strongly correlated (cor = -0.53): as intuitionclaimcst0) is weakly correlated with credit score (cor = -0.28)claimcst0 & numclaims are also correlated which obviosuly makes sense, since more the number of claims more will the total claim amountAfter data processing and exploration, modelling was started with splitting the 2017 data into training (80%) and validation (20%) (pareto rule). A seed was set to make the results reproducible. Both the resulting training and validation (named testing here) had 16% of claims.
Three machine learning models were fit onto the training dataset for classification of potentially risky and non-risky claims.
Logistic Regression was used as the baseline model to compare other models. *AUC and misclassification rate were checked for the validation set and the model which had the highest AUC and misclassification rate was selected for further comaprison with non-linear models. To start with , total three logistic models were compared:
Model 1: claim_YN ~ .
Model 2: claim_YN ~ . - agecat
Model 3: claim_YN ~ . - agecat - veh_age - veh_body
log.fit1<-glm(claim_YN ~ .,family=binomial(link='logit'),data=training)
log.fit2<-glm(claim_YN ~ .-agecat,family=binomial(link='logit'),data=training)
log.fit3<-glm(claim_YN ~ .-agecat -veh_age -veh_body ,family=binomial(link='logit'),data=training)
log.pred<-predict(log.fit2,testing,type = "response")
Out of above three models, Model 2 was selected because:
agecat removed in order to reduce the variance inflationModel 2 AIC value is slightly lower than model 3AIC(log.fit2)
## [1] 34246.53
AIC(log.fit3)
## [1] 34253.97
Notable observations from Logistic Model Model 2:
claim_YN has some linear relationship with gender, age, credit_score, area and traffic_indexclaim_YN doesn’t have linear relationship with other variables like veh_value, veh_age and veh_bodySo logistic model says that features related to vehicles don’t play any role in claim.
Next we fit a random forest model with all the variables. It was found that, here removing the variables actually increased the misclassification error on the validation set. Hence it was decided to keep a full model for random forest. Notable observations:
claim_YN with veh_value, which logistic model was missing outset.seed(500)
rf.fit<-randomForest(claim_YN ~ .-agecat,ntree = 100, data = training)
rf.pred<-predict(rf.fit,testing,type = "prob")[,2]
Lastly xgboost model was fit. Here the model without agecat was better than a full model and a subset model in terms of misclassification rate o nthe validation set.
predictors<-c("gender","age","credit_score","area","traffic_index","veh_age","veh_body","veh_value")
output_vector<-as.numeric(training[,"claim_YN"])-1
sparse_matrix_train <- sparse.model.matrix(claim_YN ~ ., data = training[,-3])[,-1]
sparse_matrix_test <- sparse.model.matrix(claim_YN ~ ., data = testing[,-3])[,-1]
set.seed(500)
xg.fit <- xgboost(data = sparse_matrix_train, label = output_vector, max_depth = 6,
eta = 0.3, nthread = 2, nrounds = 200,objective = "binary:logistic")
xgb.pred<-predict(xg.fit, sparse_matrix_test)
Below plot indicates the important variables:
traffic_index and credit_score still seem to be the most important predictors, followed by gender, age and veh_valueBelow plot shows the AUC curves for all the three models and reveals that logistic model is so far the best model.
| Model | AUC |
|---|---|
| Logistic Regression | 0.8096599 |
| Random Forest | 0.7937251 |
| XGBoost | 0.7928353 |
Logistic model Model 2 was finalized because of its higher AUC value. Now using this model a cutoff probability was determined using the gridsearch in order to minimize the misclassification rate. A cutoff probabilty would determine what risk probability is enough beyond which we can’t classify someone as a sale customer.
pr<-seq(0,1,0.02)
accry<-c()
for(i in 1:length(pr)){
log.pred<-ifelse(predict(log.fit2,testing,type = "response") >= pr[i],1,0)
accry[i]<-sum(log.pred == testing$claim_YN)/nrow(testing)
}
log.tab<-data.frame(cbind("prob" = pr, "accuracy" = accry))
ggplot(log.tab,aes(x=prob,y=accuracy)) +
geom_point(show.legend=F) +
xlab("Threshold cutoff probability") +
ylab("Accuracy") +
ggtitle("Threshold Vs Accuracy (Logistic)") + theme(plot.title = element_text(hjust=0.5)) +
geom_point(aes(x=log.tab[log.tab$accuracy==max(log.tab$accuracy),][[1]],y=log.tab[log.tab$accuracy==max(log.tab$accuracy),][[2]],colour="red",cex=4))
Final predictions were made using the above found cutoff probability (0.48) and saved in the file.
final_predictions<-data.frame(cbind("Quote Numbers" = quote_number,imp_2018,"Risk" = ifelse(predict(log.fit1,imp_2018,type = "response")>=0.48,1,0)))
table(final_predictions$Risk)
potential_non_risky<-data.frame("Quote Numbers" = final_predictions[final_predictions$Risk==0,1])
potential_risky<-data.frame("Quote Numbers" = final_predictions[final_predictions$Risk==1,1])
write.csv(potential_non_risky,"potential_non_risky_customers.csv", row.names = FALSE)
ASSUMPTON: LOW RISK customers needed to be defined. So in this section we would mean the bottom 15% customers in terms of risk among all risky customers. (This is %ile can be any number according to business requirement).
Section 1 identified potential risky customers, this section calculated the potential cost for the claims in the cases where we can expect a claim to happen. Below code calculates the cost per claim per policy by taking the ratio of claimcst0 & numclaims, after filtering for risky customers only.
claim_cust<-imp_2017 %>%
mutate(cost_per_claim = ifelse(is.nan(claimcst0/numclaims),0,round((claimcst0/numclaims),2))) %>%
select(-c(10:12)) %>%
filter(cost_per_claim>0)
Below corrleation matrix and histogram suggest that:
cost_per_claim is mildly correlated with veh_age and veh_valuecost_per_claim is highly skewed (and thus needs a log transformation)Before log transformation:
After log transformation:
Once again we would split our data into 80-20.
set.seed(500)
index<-sample(nrow(claim_cust),0.8*nrow(claim_cust),replace = FALSE)
training<-claim_cust[index,]
testing<-claim_cust[-index,]
Multiple Linear Regression was used as the baseline model to compare other models. Mean Squared Error was checked for the validation set and the linear model which had the lowest MSE was selected for further comaprison with non-linear models. To start with, total three linear models were compared:
Model 1: cost_per_claim ~ .
Model 2: cost_per_claim ~ .- agecat
Model 3: cost_per_claim ~ .-veh_age -agecat
Notable observations (after looking at model summaries):
Model 1, coefficient sign of age changed from -ve to +ve showing its collinearity with agecatveh_age, the positive coefficient of veh_value increased (showing effect of collinearity with veh_age)lm.cpc.fit1<-lm(cost_per_claim ~ .,data = training)
lm.cpc.fit2<-lm(cost_per_claim ~ .- agecat,data = training)
lm.cpc.fit3<-lm(cost_per_claim ~ .-veh_age -agecat,data = training)
lm.mse1<-sum((predict(lm.cpc.fit1,testing)-testing$cost_per_claim)^2)/nrow(testing)
lm.mse2<-sum((predict(lm.cpc.fit2,testing)-testing$cost_per_claim)^2)/nrow(testing)
lm.mse3<-sum((predict(lm.cpc.fit3,testing)-testing$cost_per_claim)^2)/nrow(testing)
Model 1 was selected among linear models for further comparison becuase of its low MSE 0.4438146 (others being 0.524292 & 0.6931242).
Below is the residual plot which assures that the normality assumption was not violated.
plot(lm.cpc.fit1)
Next a random forest model was fit with all variables. Again removing the variables actually increased the MSE on the validation set. Hence it was decided to keep a full model for random forest.
set.seed(500)
rf.cpc.fit1<-randomForest(cost_per_claim ~ ., ntree = 300,data = training)
rf.mse1<-sum((predict(rf.cpc.fit1,testing)-testing$cost_per_claim)^2)/nrow(testing)
plot(rf.cpc.fit1)
Notable observations:
claim_YN, vehicle factors like veh_value and veh_age are coming out to be significant for cost_per_claimvarImpPlot(rf.cpc.fit1,type=2, main = "Credit Score and Vehicle age are the most important predictors")
Xgboost was used to fit a thrid model. Here it was found that while full model was not best but removing veh_age was not appropriate unlike randomForest. So here we would be keeping model 2
Model 1: cost_per_claim ~ .
Model 2: cost_per_claim ~ .- agecat
Model 3: cost_per_claim ~ .-veh_age -agecat
output_vector<-as.numeric(training[,"cost_per_claim"])
sparse_matrix_train <- sparse.model.matrix(cost_per_claim ~ ., data = training[,-3])[,-1]
sparse_matrix_test <- sparse.model.matrix(cost_per_claim ~ ., data = testing[,-3])[,-1]
set.seed(500)
xg.fit2 <- xgboost(data = sparse_matrix_train, label = output_vector, max_depth = 6,
eta = 0.3, nthread = 2, nrounds = 200,objective = "reg:linear")
importance <- xgb.importance(feature_names = colnames(sparse_matrix_train), model = xg.fit2)
xgb.mse2<-sum((predict(xg.fit2, sparse_matrix_test)-testing$cost_per_claim)^2)/nrow(testing)
Notable observation:
Since MSE was the lowest for the linear model 0.4438146 than others (Random Forest = rf.mse1 & XGboost = xgb.mse2), we would use this model fro the predictions.
Code below does anti-log of previously transformed cost_per_claim The file potential_low_risky_customers.csv which is getting extracted below would be having bottom 15% low risk customers.
potential_risky<-data.frame(cbind(potential_risky,
"Cost_Per_Claim" = predict(lm.cpc.fit1,final_predictions[final_predictions$Quote.Numbers %in% potential_risky$Quote.Numbers,-c(1,11)])))
potential_risky$real_cost_per_claim<-exp(potential_risky$Cost_Per_Claim)
low_risky<-potential_risky %>%
mutate(quantile = ntile(real_cost_per_claim, 100)) %>%
filter(quantile<=15) %>% # Selecting las t15%iles to be low-risky customers (selection of cutoff depends on business)
select(Quote.Numbers)
write.csv(low_risky,"potential_low_risky_customers.csv",row.names = FALSE)
In order to profile the customers into risk groups, an optimum number of clusters was required such that the groups would be as different as possible. To achieve this, Partition Around Medoids realisation of K-medoid clustering algorithm was used , which suggested an optimum number of 4 segments.
ASSUMPTION: To profile customers based on their risks to claim, certain weigths were given to the features according to their importance/significance till now in the analysis. For example: Risk_Prob was given the highest weight 7, followed by traffic_index,credit_score and gender,age and veh_body were given the least.
Below code does the follwoing steps in order:
features<-c("Quote.Numbers","gender","age","credit_score","area","traffic_index",
"veh_age","veh_body","veh_value","Risk","Cost_Per_Claim","real_cost_per_claim")
cust_all_risk<-final_predictions %>%
mutate("Risk_Prob" = predict(log.fit1,imp_2018,type = "response")) %>%
# select(c("Quote.Numbers","gender","age","credit_score","veh_body","veh_value","traffic_index","Risk","Risk_Prob")) %>%
select(c("Quote.Numbers","age","credit_score","veh_value","traffic_index","Risk_Prob","Risk")) %>%
filter(Risk == 1) %>%
select(c("Quote.Numbers","age","credit_score","veh_value","traffic_index","Risk_Prob"))
# "gender" "age" "credit_score" "veh_value" "traffic_index" "Risk" "Risk_Prob"
# Calculate Gower Distance
gower_dist <- daisy(cust_all_risk[,-1],metric = "gower", type = list(logratio = c(1,2,3,4,5))) # , weights = c(2,3,7,4,5,7,0,8)
# Log transformation for positively skewed variables: FAMILY_TOT_SALES, FAMILY_TOT_VISITS
# Calculate optimal number of clusters
sil_width <- c(NA)
for(i in 2:13){
set.seed(i)
pam_fit<-pam(gower_dist, diss = TRUE,k = i) # PAM: Partitioning Around Medoids
sil_width[i]<-pam_fit$silinfo$avg.width
}
tab<-data.frame(x=1:13,sil_width=sil_width)
ggplot(data=tab,aes(x = x,y = sil_width)) +
geom_point(cex=3,col="red")+geom_line() +
ggtitle("Silhoutte Width Vs Number of clusters") +
theme(plot.title = element_text(hjust=0.5)) +
xlab("Number of clusters")
# Number of clusters suggested by silhoutte analysis: 5
Next we group the risky customers using pam() from cluster package based on gower distance. Below are the final visualization of the four risk segments:
set.seed(5)
pam_fit<-pam(gower_dist, diss=TRUE, k = 5)
cust_all_risk<-cbind(cust_all_risk, Group = pam_fit$clustering)
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(pam_fit$clustering),
name = cust_all_risk$Quote.Numbers)
ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = cluster)) +
ggtitle("Customer Segments") +
theme(plot.title = element_text(hjust = 0.5))
This is how the final segments land their group averages for age, probability of risk credit_score and traffic_index look like.
| Group | count_cust | avg_age | min_prob | med_prob | max_prob | min_cred_scr | med_cred_scr | max_cred_scr | min_tf_in | med_tf_in | max_tf_in |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 138 | 63.29 | 0.48 | 0.64 | 0.82 | 301 | 376 | 575 | 42 | 125 | 169.4 |
| 2 | 88 | 26.78 | 0.49 | 0.6 | 0.71 | 301 | 370 | 520 | 16.5 | 103.2 | 156 |
| 3 | 98 | 22.74 | 0.48 | 0.58 | 0.84 | 454 | 614.5 | 754 | 24.7 | 133.5 | 174.5 |
| 4 | 104 | 44.34 | 0.48 | 0.52 | 0.71 | 303 | 491 | 708 | 42 | 124.55 | 186.9 |
| 5 | 137 | 32.66 | 0.65 | 0.77 | 0.95 | 301 | 344 | 501 | 38 | 124.5 | 223.7 |
Here’s the summary and key take-aways of the above segments:
NOTE: Ideal number of segments suggested was 11, but 11 number of segments was too large for the campaigns and 5 was a trade-off number.