library(xlsx)
## Loading required package: rJava
## Loading required package: xlsxjars
customers <- read.xlsx("D:/Analytics3/bikeshops.xlsx", sheetIndex = 1)
products <- read.xlsx("D:/Analytics3//bikes.xlsx", sheetIndex = 1)
orders <- read.xlsx("D:/Analytics3/orders.xlsx", sheetIndex = 1)
# Combine orders, customers, and products data frames --------------------------
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
orders.extended <- merge(orders, customers, by.x = "customer.id", by.y="bikeshop.id")
orders.extended <- merge(orders.extended, products, by.x = "product.id", by.y = "bike.id")
orders.extended <- orders.extended %>%
mutate(price.extended = price * quantity) %>%
select(order.date, order.id, order.line, bikeshop.name, model,
quantity, price, price.extended, category1, category2, frame) %>%
arrange(order.id, order.line)
knitr::kable(head(orders.extended)) # Preview the data
2011-01-07 |
1 |
1 |
Ithaca Mountain Climbers |
Jekyll Carbon 2 |
1 |
6070 |
6070 |
Mountain |
Over Mountain |
Carbon |
2011-01-07 |
1 |
2 |
Ithaca Mountain Climbers |
Trigger Carbon 2 |
1 |
5970 |
5970 |
Mountain |
Over Mountain |
Carbon |
2011-01-10 |
2 |
1 |
Kansas City 29ers |
Beast of the East 1 |
1 |
2770 |
2770 |
Mountain |
Trail |
Aluminum |
2011-01-10 |
2 |
2 |
Kansas City 29ers |
Trigger Carbon 2 |
1 |
5970 |
5970 |
Mountain |
Over Mountain |
Carbon |
2011-01-10 |
3 |
1 |
Louisville Race Equipment |
Supersix Evo Hi-Mod Team |
1 |
10660 |
10660 |
Road |
Elite Road |
Carbon |
2011-01-10 |
3 |
2 |
Louisville Race Equipment |
Jekyll Carbon 4 |
1 |
3200 |
3200 |
Mountain |
Over Mountain |
Carbon |
# Group by model & model features, summarize by quantity purchased -------------
library(tidyr) # Needed for spread function
customerTrends <- orders.extended %>%
group_by(bikeshop.name, model, category1, category2, frame, price) %>%
summarise(total.qty = sum(quantity)) %>%
spread(bikeshop.name, total.qty)
customerTrends[is.na(customerTrends)] <- 0 # Remove NA's
# Convert price to binary high/low category ------------------------------------
library(Hmisc) # Needed for cut2 function
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
customerTrends$price <- cut2(customerTrends$price, g=2)
# Convert customer purchase quantity to percentage of total quantity -----------
customerTrends.mat <- as.matrix(customerTrends[,-(1:5)]) # Drop first five columns
customerTrends.mat <- prop.table(customerTrends.mat, margin = 2) # column-wise pct
customerTrends <- bind_cols(customerTrends[,1:5], as.data.frame(customerTrends.mat))
# Running the k-means algorithm -------------------------------------------------
library(cluster) # Needed for silhouette function
kmeansDat <- customerTrends[,-(1:5)] # Extract only customer columns
kmeansDat.t <- t(kmeansDat) # Get customers in rows and products in columns
#View(kmeansDat.t)
# Setup for k-means loop
km.out <- list()
sil.out <- list()
x <- vector()
y <- vector()
minClust <- 4 # Hypothesized minimum number of segments
maxClust <- 8 # Hypothesized maximum number of segments
# Compute k-means clustering over various clusters, k, from minClust to maxClust
for (centr in minClust:maxClust) {
i <- centr-(minClust-1) # relevels start as 1, and increases with centr
set.seed(11) # For reproducibility
km.out[i] <- list(kmeans(kmeansDat.t, centers = centr, nstart = 50))
sil.out[i] <- list(silhouette(km.out[[i]][[1]], dist(kmeansDat.t)))
# Used for plotting silhouette average widths
x[i] = centr # value of k
y[i] = summary(sil.out[[i]])[[4]] # Silhouette average width
}
# Plot silhouette results to find best number of clusters; closer to 1 is better
library(ggplot2)
ggplot(data = data.frame(x, y), aes(x, y)) +
geom_point(size=3) +
geom_line() +
xlab("Number of Cluster Centers") +
ylab("Silhouette Average Width") +
ggtitle("Silhouette Average Width as Cluster Center Varies")

# Get customer names that are in each segment ----------------------------------
# Get attributes of optimal k-means output
maxSilRow <- which.max(y) # Row number of max silhouette value
optimalClusters <- x[maxSilRow] # Number of clusters
km.out.best <- km.out[[maxSilRow]] # k-means output of best cluster
# Create list of customer names for each cluster
clusterNames <- list()
clusterList <- list()
for (clustr in 1:optimalClusters) {
clusterNames[clustr] <- paste0("X", clustr)
clusterList[clustr] <- list(
names(
km.out.best$cluster[km.out.best$cluster == clustr]
)
)
}
names(clusterList) <- clusterNames
print(clusterList)
## $X1
## [1] "Ithaca Mountain Climbers" "Pittsburgh Mountain Machines"
## [3] "Tampa 29ers"
##
## $X2
## [1] "Philadelphia Bike Shop" "San Antonio Bike Shop"
##
## $X3
## [1] "Ann Arbor Speed" "Austin Cruisers"
## [3] "Indianapolis Velocipedes" "Miami Race Equipment"
## [5] "Nashville Cruisers" "New Orleans Velocipedes"
## [7] "Oklahoma City Race Equipment" "Seattle Race Equipment"
##
## $X4
## [1] "Albuquerque Cycles" "Dallas Cycles"
## [3] "Denver Bike Shop" "Detroit Cycles"
## [5] "Kansas City 29ers" "Los Angeles Cycles"
## [7] "Minneapolis Bike Shop" "New York Cycles"
## [9] "Phoenix Bi-peds" "Portland Bi-peds"
## [11] "Providence Bi-peds"
##
## $X5
## [1] "Cincinnati Speed" "Columbus Race Equipment"
## [3] "Las Vegas Cycles" "Louisville Race Equipment"
## [5] "San Francisco Cruisers" "Wichita Speed"
# Combine cluster centroids with bike models for feature inspection ------------
custSegmentCntrs <- t(km.out.best$centers) # Get centroids for groups
colnames(custSegmentCntrs) <- make.names(colnames(custSegmentCntrs))
customerTrends.clustered <- bind_cols(customerTrends[,1:5], as.data.frame(custSegmentCntrs))
# Arrange top 10 bike models by cluster in descending order --------------------
attach(customerTrends.clustered) # Allows ordering by column name
knitr::kable(head(customerTrends.clustered[order(-X1), c(1:5, 6)], 10))
Scalpel-Si Carbon 3 |
Mountain |
Cross Country Race |
Carbon |
[3500,12790] |
0.0342692 |
Jekyll Carbon 4 |
Mountain |
Over Mountain |
Carbon |
[ 415, 3500) |
0.0302818 |
Scalpel 29 Carbon Race |
Mountain |
Cross Country Race |
Carbon |
[3500,12790] |
0.0280391 |
Trigger Carbon 3 |
Mountain |
Over Mountain |
Carbon |
[3500,12790] |
0.0259353 |
Habit Carbon 2 |
Mountain |
Trail |
Carbon |
[3500,12790] |
0.0233750 |
Trigger Carbon 4 |
Mountain |
Over Mountain |
Carbon |
[ 415, 3500) |
0.0232606 |
Catalyst 4 |
Mountain |
Sport |
Aluminum |
[ 415, 3500) |
0.0215639 |
Jekyll Carbon 2 |
Mountain |
Over Mountain |
Carbon |
[3500,12790] |
0.0210792 |
Supersix Evo Hi-Mod Dura Ace 2 |
Road |
Elite Road |
Carbon |
[3500,12790] |
0.0210578 |
Trigger Carbon 2 |
Mountain |
Over Mountain |
Carbon |
[3500,12790] |
0.0210433 |
# Arrange top 10 bike models by cluster in descending order
knitr::kable(head(customerTrends.clustered[order(-X2), c(1:5, 7)], 10))
Slice Ultegra |
Road |
Triathalon |
Carbon |
[ 415, 3500) |
0.0554531 |
Trigger Carbon 4 |
Mountain |
Over Mountain |
Carbon |
[ 415, 3500) |
0.0304147 |
CAAD12 105 |
Road |
Elite Road |
Aluminum |
[ 415, 3500) |
0.0270792 |
Beast of the East 3 |
Mountain |
Trail |
Aluminum |
[ 415, 3500) |
0.0263331 |
Trail 1 |
Mountain |
Sport |
Aluminum |
[ 415, 3500) |
0.0249397 |
Bad Habit 1 |
Mountain |
Trail |
Aluminum |
[ 415, 3500) |
0.0229976 |
F-Si Carbon 4 |
Mountain |
Cross Country Race |
Carbon |
[ 415, 3500) |
0.0224490 |
CAAD12 Disc 105 |
Road |
Elite Road |
Aluminum |
[ 415, 3500) |
0.0209568 |
Synapse Disc 105 |
Road |
Endurance Road |
Aluminum |
[ 415, 3500) |
0.0209568 |
Trail 2 |
Mountain |
Sport |
Aluminum |
[ 415, 3500) |
0.0203094 |
detach(customerTrends.clustered)