K Mean Clustering

library(xlsx)

## Loading required package: rJava

## Loading required package: xlsxjars

customers <- read.xlsx("D:/Analytics3/bikeshops.xlsx", sheetIndex = 1)
products <- read.xlsx("D:/Analytics3//bikes.xlsx", sheetIndex = 1) 
orders <- read.xlsx("D:/Analytics3/orders.xlsx", sheetIndex = 1)


# Combine orders, customers, and products data frames --------------------------
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

orders.extended <- merge(orders, customers, by.x = "customer.id", by.y="bikeshop.id")
orders.extended <- merge(orders.extended, products, by.x = "product.id", by.y = "bike.id")

orders.extended <- orders.extended %>%
  mutate(price.extended = price * quantity) %>%
  select(order.date, order.id, order.line, bikeshop.name, model,
         quantity, price, price.extended, category1, category2, frame) %>%
  arrange(order.id, order.line)

knitr::kable(head(orders.extended)) # Preview the data

order.date	order.id	order.line	bikeshop.name	model	quantity	price	price.extended	category1	category2	frame
2011-01-07	1	1	Ithaca Mountain Climbers	Jekyll Carbon 2	1	6070	6070	Mountain	Over Mountain	Carbon
2011-01-07	1	2	Ithaca Mountain Climbers	Trigger Carbon 2	1	5970	5970	Mountain	Over Mountain	Carbon
2011-01-10	2	1	Kansas City 29ers	Beast of the East 1	1	2770	2770	Mountain	Trail	Aluminum
2011-01-10	2	2	Kansas City 29ers	Trigger Carbon 2	1	5970	5970	Mountain	Over Mountain	Carbon
2011-01-10	3	1	Louisville Race Equipment	Supersix Evo Hi-Mod Team	1	10660	10660	Road	Elite Road	Carbon
2011-01-10	3	2	Louisville Race Equipment	Jekyll Carbon 4	1	3200	3200	Mountain	Over Mountain	Carbon

# Group by model & model features, summarize by quantity purchased -------------
library(tidyr)  # Needed for spread function
customerTrends <- orders.extended %>%
  group_by(bikeshop.name, model, category1, category2, frame, price) %>%
  summarise(total.qty = sum(quantity)) %>%
  spread(bikeshop.name, total.qty)
customerTrends[is.na(customerTrends)] <- 0  # Remove NA's


# Convert price to binary high/low category ------------------------------------
library(Hmisc)  # Needed for cut2 function

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

customerTrends$price <- cut2(customerTrends$price, g=2)   



# Convert customer purchase quantity to percentage of total quantity -----------
customerTrends.mat <- as.matrix(customerTrends[,-(1:5)])  # Drop first five columns
customerTrends.mat <- prop.table(customerTrends.mat, margin = 2)  # column-wise pct
customerTrends <- bind_cols(customerTrends[,1:5], as.data.frame(customerTrends.mat))



# Running the k-means algorithm -------------------------------------------------
library(cluster) # Needed for silhouette function

kmeansDat <- customerTrends[,-(1:5)]  # Extract only customer columns
kmeansDat.t <- t(kmeansDat)  # Get customers in rows and products in columns

#View(kmeansDat.t)

# Setup for k-means loop 
km.out <- list()
sil.out <- list()
x <- vector()
y <- vector()
minClust <- 4      # Hypothesized minimum number of segments
maxClust <- 8      # Hypothesized maximum number of segments

# Compute k-means clustering over various clusters, k, from minClust to maxClust
for (centr in minClust:maxClust) {
  i <- centr-(minClust-1) # relevels start as 1, and increases with centr
  set.seed(11) # For reproducibility
  km.out[i] <- list(kmeans(kmeansDat.t, centers = centr, nstart = 50))
  sil.out[i] <- list(silhouette(km.out[[i]][[1]], dist(kmeansDat.t)))
  # Used for plotting silhouette average widths
  x[i] = centr  # value of k
  y[i] = summary(sil.out[[i]])[[4]]  # Silhouette average width
}



# Plot silhouette results to find best number of clusters; closer to 1 is better
library(ggplot2)
ggplot(data = data.frame(x, y), aes(x, y)) + 
  geom_point(size=3) + 
  geom_line() +
  xlab("Number of Cluster Centers") +
  ylab("Silhouette Average Width") +
  ggtitle("Silhouette Average Width as Cluster Center Varies")

# Get customer names that are in each segment ----------------------------------

# Get attributes of optimal k-means output
maxSilRow <- which.max(y)          # Row number of max silhouette value
optimalClusters <- x[maxSilRow]    # Number of clusters
km.out.best <- km.out[[maxSilRow]] # k-means output of best cluster

# Create list of customer names for each cluster
clusterNames <- list()
clusterList <- list()
for (clustr in 1:optimalClusters) {
  clusterNames[clustr] <- paste0("X", clustr)
  clusterList[clustr] <- list(
    names(
      km.out.best$cluster[km.out.best$cluster == clustr]
    )
  )
}
names(clusterList) <- clusterNames

print(clusterList)

## $X1
## [1] "Ithaca Mountain Climbers"     "Pittsburgh Mountain Machines"
## [3] "Tampa 29ers"                 
## 
## $X2
## [1] "Philadelphia Bike Shop" "San Antonio Bike Shop" 
## 
## $X3
## [1] "Ann Arbor Speed"              "Austin Cruisers"             
## [3] "Indianapolis Velocipedes"     "Miami Race Equipment"        
## [5] "Nashville Cruisers"           "New Orleans Velocipedes"     
## [7] "Oklahoma City Race Equipment" "Seattle Race Equipment"      
## 
## $X4
##  [1] "Albuquerque Cycles"    "Dallas Cycles"        
##  [3] "Denver Bike Shop"      "Detroit Cycles"       
##  [5] "Kansas City 29ers"     "Los Angeles Cycles"   
##  [7] "Minneapolis Bike Shop" "New York Cycles"      
##  [9] "Phoenix Bi-peds"       "Portland Bi-peds"     
## [11] "Providence Bi-peds"   
## 
## $X5
## [1] "Cincinnati Speed"          "Columbus Race Equipment"  
## [3] "Las Vegas Cycles"          "Louisville Race Equipment"
## [5] "San Francisco Cruisers"    "Wichita Speed"

# Combine cluster centroids with bike models for feature inspection ------------
custSegmentCntrs <- t(km.out.best$centers)  # Get centroids for groups
colnames(custSegmentCntrs) <- make.names(colnames(custSegmentCntrs))
customerTrends.clustered <- bind_cols(customerTrends[,1:5], as.data.frame(custSegmentCntrs))


# Arrange top 10 bike models by cluster in descending order --------------------
attach(customerTrends.clustered)  # Allows ordering by column name
knitr::kable(head(customerTrends.clustered[order(-X1), c(1:5, 6)], 10))

model	category1	category2	frame	price	X1
Scalpel-Si Carbon 3	Mountain	Cross Country Race	Carbon	[3500,12790]	0.0342692
Jekyll Carbon 4	Mountain	Over Mountain	Carbon	[ 415, 3500)	0.0302818
Scalpel 29 Carbon Race	Mountain	Cross Country Race	Carbon	[3500,12790]	0.0280391
Trigger Carbon 3	Mountain	Over Mountain	Carbon	[3500,12790]	0.0259353
Habit Carbon 2	Mountain	Trail	Carbon	[3500,12790]	0.0233750
Trigger Carbon 4	Mountain	Over Mountain	Carbon	[ 415, 3500)	0.0232606
Catalyst 4	Mountain	Sport	Aluminum	[ 415, 3500)	0.0215639
Jekyll Carbon 2	Mountain	Over Mountain	Carbon	[3500,12790]	0.0210792
Supersix Evo Hi-Mod Dura Ace 2	Road	Elite Road	Carbon	[3500,12790]	0.0210578
Trigger Carbon 2	Mountain	Over Mountain	Carbon	[3500,12790]	0.0210433

# Arrange top 10 bike models by cluster in descending order
knitr::kable(head(customerTrends.clustered[order(-X2), c(1:5, 7)], 10))

model	category1	category2	frame	price	X2
Slice Ultegra	Road	Triathalon	Carbon	[ 415, 3500)	0.0554531
Trigger Carbon 4	Mountain	Over Mountain	Carbon	[ 415, 3500)	0.0304147
CAAD12 105	Road	Elite Road	Aluminum	[ 415, 3500)	0.0270792
Beast of the East 3	Mountain	Trail	Aluminum	[ 415, 3500)	0.0263331
Trail 1	Mountain	Sport	Aluminum	[ 415, 3500)	0.0249397
Bad Habit 1	Mountain	Trail	Aluminum	[ 415, 3500)	0.0229976
F-Si Carbon 4	Mountain	Cross Country Race	Carbon	[ 415, 3500)	0.0224490
CAAD12 Disc 105	Road	Elite Road	Aluminum	[ 415, 3500)	0.0209568
Synapse Disc 105	Road	Endurance Road	Aluminum	[ 415, 3500)	0.0209568
Trail 2	Mountain	Sport	Aluminum	[ 415, 3500)	0.0203094

detach(customerTrends.clustered)

K Mean Clustering

Vyomesh Upadhyay

October 22, 2018