By varying k from 2 to K=20 and saving models we can track how k-means relevant metrics change.
K = 20
kmeansModels = list()
for(k in 2:K) {
kmeansCustomers1 = computeKmeans(taConnection, "p17.cust_metrics_summary_june2016", centers=k,
include=c('voice_minutes_avg', 'sms_usage_avg', 'data_usage_avg'),
scale = TRUE, id='cust_id', idAlias = "cust_id",
where="plan IN ('Plan 40','Plan 30','Plan 35','Plan 39','Plan 25','Plan 50')",
scaledTableName=paste0('kmeans_customer_scaled_k_',k),
centroidTableName=paste0('kmeans_customer_centroids_k_',k), schema='p17',
iterMax = 500)
kmeansModels[[k-1]] = kmeansCustomers1
}
totalRunningTimeMinutes = sum(sapply(kmeansModels, FUN = function(x){x$time['elapsed']}))/60
# Graph for Total Withinness
data = data.frame(K=integer(0), totwithinss=numeric(0))
for(n in 2:(length(kmeansModels)+1)) {
kmeansModel = kmeansModels[[n-1]]
data = rbind(data,
data.frame(K=n, totwithinss=kmeansModel$tot.withinss))
}
ggplot(data) +
geom_line(aes(K, totwithinss), size=1) + geom_point(aes(K, totwithinss), size=2) +
labs(title="Within groups Sums of Squares vs. the Number of Clusters",
x="Number of Clusters", y="Within groups Sums of Squares") +
scale_x_continuous(breaks=seq(0,K,2)) +
scale_y_continuous(labels = comma) +
theme_minimal(base_size = 12, base_family = 'serif')
We pick 6 cluster model based on the elbow position:
k = 6
kmeans.model = computeKmeans(taConnection, "p17.cust_metrics_summary_june2016", centers=k,
include=c('voice_minutes_avg', 'sms_usage_avg', 'data_usage_avg'),
scale = TRUE,
aggregates = c("COUNT(*) cnt", "AVG(age) avg_age",
"SUM(CASE WHEN plan = 'Plan 40' THEN 1 ELSE 0 END) plan40_count",
"SUM(CASE WHEN plan = 'Plan 30' THEN 1 ELSE 0 END) plan30_count",
"SUM(CASE WHEN plan = 'Plan 35' THEN 1 ELSE 0 END) plan35_count",
"SUM(CASE WHEN plan = 'Plan 39' THEN 1 ELSE 0 END) plan39_count",
"SUM(CASE WHEN plan = 'Plan 25' THEN 1 ELSE 0 END) plan25_count",
"SUM(CASE WHEN plan = 'Plan 50' THEN 1 ELSE 0 END) plan50_count"),
id='cust_id', idAlias = "cust_id",
where="plan IN ('Plan 40','Plan 30','Plan 35','Plan 39','Plan 25','Plan 50')",
scaledTableName=paste0('kmeans_picked_customer_scaled_k_',k),
centroidTableName=paste0('kmeans_picked_customer_centroids_k_',k), schema='p17',
iterMax = 500)
createCentroidPlot(kmeans.model, format="line", groupByCluster=TRUE, coordFlip = FALSE,
title = "Mobile Customer Clustering: Centroids")
createCentroidPlot(kmeans.model, format="line", groupByCluster=FALSE, coordFlip = FALSE,
title = "Mobile Customer Clustering: Centroids")
createClusterPlot(kmeans.model)
kmeans.model = computeClusterSample(taConnection, kmeans.model, 0.05)
createClusterPairsPlot(kmeans.model)
voiceMinutesHist = computeHistogram(taConnection, tableName = "p17.cust_metrics_summary_june2016",
columnName = "voice_minutes_avg", tableInfo = tableInfo,
numbins = 80, startvalue = 0, endvalue = 80)
createHistogram(voiceMinutesHist,
title="Monthly Average Voice Usage", xlab="Minutes", ylab="Count",
baseSize = 12, baseFamily = 'serif')
smsHist = computeHistogram(taConnection, tableName = "p17.cust_metrics_summary_june2016",
columnName = "sms_usage_avg", tableInfo = tableInfo,
numbins = 80, startvalue = 0, endvalue = 80)
createHistogram(smsHist,
title="Monthly Average SMS Usage", xlab="Tex Message Count", ylab="Count",
baseSize = 12, baseFamily = 'serif')
dataHist = computeHistogram(taConnection, tableName = "p17.cust_metrics_summary_june2016",
columnName = "data_usage_avg", tableInfo = tableInfo,
numbins = 50, startvalue = 0, endvalue = 10)
createHistogram(dataHist,
title="Monthly Average Data Usage", xlab="Minutes", ylab="Count",
baseSize = 12, baseFamily = 'serif')
We reduce number of clusters by 1 since it new data set will not contain inactive users that occupied most populated cluster before. Note augmented where clause argument that filters out customers with low usage for all 3 parameters.
k = 5
kmeansActiveModel = computeKmeans(taConnection, "p17.cust_metrics_summary_june2016", centers=k,
include=c('voice_minutes_avg', 'sms_usage_avg', 'data_usage_avg'),
scale = TRUE,
aggregates = c("COUNT(*) cnt", "AVG(age) avg_age",
"SUM(CASE WHEN plan = 'Plan 40' THEN 1 ELSE 0 END) plan40_count",
"SUM(CASE WHEN plan = 'Plan 30' THEN 1 ELSE 0 END) plan30_count",
"SUM(CASE WHEN plan = 'Plan 35' THEN 1 ELSE 0 END) plan35_count",
"SUM(CASE WHEN plan = 'Plan 39' THEN 1 ELSE 0 END) plan39_count",
"SUM(CASE WHEN plan = 'Plan 25' THEN 1 ELSE 0 END) plan25_count",
"SUM(CASE WHEN plan = 'Plan 50' THEN 1 ELSE 0 END) plan50_count"),
id='cust_id', idAlias = "cust_id",
where="plan IN ('Plan 40','Plan 30','Plan 35','Plan 39','Plan 25','Plan 50')
AND voice_minutes_avg >= 10 AND sms_usage_avg >= 10 AND data_usage_avg >= 2",
scaledTableName=paste0('kmeans_customer_active_scaled_k_',k),
centroidTableName=paste0('kmeans_customer_active_centroids_k_',k), schema='p17',
iterMax = 500)
createCentroidPlot(kmeansActiveModel, format="line", groupByCluster=TRUE, coordFlip = FALSE,
title = "Mobile Customer Clustering: Centroids")
createCentroidPlot(kmeansActiveModel, format="line", groupByCluster=FALSE, coordFlip = FALSE,
title = "Mobile Customer Clustering: Centroids")
createClusterPlot(kmeansActiveModel)
kmeansActiveModel = computeClusterSample(taConnection, kmeansActiveModel, 0.05)
createClusterPairsPlot(kmeansActiveModel)