library(xlsx) 
## Loading required package: rJava
## Loading required package: xlsxjars
customers <- read.xlsx("D:/Analytics3/bikeshops.xlsx", sheetIndex = 1)
products <- read.xlsx("D:/Analytics3//bikes.xlsx", sheetIndex = 1) 
orders <- read.xlsx("D:/Analytics3/orders.xlsx", sheetIndex = 1)


# Combine orders, customers, and products data frames --------------------------
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
orders.extended <- merge(orders, customers, by.x = "customer.id", by.y="bikeshop.id")
orders.extended <- merge(orders.extended, products, by.x = "product.id", by.y = "bike.id")

orders.extended <- orders.extended %>%
  mutate(price.extended = price * quantity) %>%
  select(order.date, order.id, order.line, bikeshop.name, model,
         quantity, price, price.extended, category1, category2, frame) %>%
  arrange(order.id, order.line)

knitr::kable(head(orders.extended)) # Preview the data
order.date order.id order.line bikeshop.name model quantity price price.extended category1 category2 frame
2011-01-07 1 1 Ithaca Mountain Climbers Jekyll Carbon 2 1 6070 6070 Mountain Over Mountain Carbon
2011-01-07 1 2 Ithaca Mountain Climbers Trigger Carbon 2 1 5970 5970 Mountain Over Mountain Carbon
2011-01-10 2 1 Kansas City 29ers Beast of the East 1 1 2770 2770 Mountain Trail Aluminum
2011-01-10 2 2 Kansas City 29ers Trigger Carbon 2 1 5970 5970 Mountain Over Mountain Carbon
2011-01-10 3 1 Louisville Race Equipment Supersix Evo Hi-Mod Team 1 10660 10660 Road Elite Road Carbon
2011-01-10 3 2 Louisville Race Equipment Jekyll Carbon 4 1 3200 3200 Mountain Over Mountain Carbon
# Group by model & model features, summarize by quantity purchased -------------
library(tidyr)  # Needed for spread function
customerTrends <- orders.extended %>%
  group_by(bikeshop.name, model, category1, category2, frame, price) %>%
  summarise(total.qty = sum(quantity)) %>%
  spread(bikeshop.name, total.qty)
customerTrends[is.na(customerTrends)] <- 0  # Remove NA's


# Convert price to binary high/low category ------------------------------------
library(Hmisc)  # Needed for cut2 function
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
customerTrends$price <- cut2(customerTrends$price, g=2)   



# Convert customer purchase quantity to percentage of total quantity -----------
customerTrends.mat <- as.matrix(customerTrends[,-(1:5)])  # Drop first five columns
customerTrends.mat <- prop.table(customerTrends.mat, margin = 2)  # column-wise pct
customerTrends <- bind_cols(customerTrends[,1:5], as.data.frame(customerTrends.mat))



# Running the k-means algorithm -------------------------------------------------
library(cluster) # Needed for silhouette function

kmeansDat <- customerTrends[,-(1:5)]  # Extract only customer columns
kmeansDat.t <- t(kmeansDat)  # Get customers in rows and products in columns

#View(kmeansDat.t)

# Setup for k-means loop 
km.out <- list()
sil.out <- list()
x <- vector()
y <- vector()
minClust <- 4      # Hypothesized minimum number of segments
maxClust <- 8      # Hypothesized maximum number of segments

# Compute k-means clustering over various clusters, k, from minClust to maxClust
for (centr in minClust:maxClust) {
  i <- centr-(minClust-1) # relevels start as 1, and increases with centr
  set.seed(11) # For reproducibility
  km.out[i] <- list(kmeans(kmeansDat.t, centers = centr, nstart = 50))
  sil.out[i] <- list(silhouette(km.out[[i]][[1]], dist(kmeansDat.t)))
  # Used for plotting silhouette average widths
  x[i] = centr  # value of k
  y[i] = summary(sil.out[[i]])[[4]]  # Silhouette average width
}



# Plot silhouette results to find best number of clusters; closer to 1 is better
library(ggplot2)
ggplot(data = data.frame(x, y), aes(x, y)) + 
  geom_point(size=3) + 
  geom_line() +
  xlab("Number of Cluster Centers") +
  ylab("Silhouette Average Width") +
  ggtitle("Silhouette Average Width as Cluster Center Varies")

# Get customer names that are in each segment ----------------------------------

# Get attributes of optimal k-means output
maxSilRow <- which.max(y)          # Row number of max silhouette value
optimalClusters <- x[maxSilRow]    # Number of clusters
km.out.best <- km.out[[maxSilRow]] # k-means output of best cluster

# Create list of customer names for each cluster
clusterNames <- list()
clusterList <- list()
for (clustr in 1:optimalClusters) {
  clusterNames[clustr] <- paste0("X", clustr)
  clusterList[clustr] <- list(
    names(
      km.out.best$cluster[km.out.best$cluster == clustr]
    )
  )
}
names(clusterList) <- clusterNames

print(clusterList)
## $X1
## [1] "Ithaca Mountain Climbers"     "Pittsburgh Mountain Machines"
## [3] "Tampa 29ers"                 
## 
## $X2
## [1] "Philadelphia Bike Shop" "San Antonio Bike Shop" 
## 
## $X3
## [1] "Ann Arbor Speed"              "Austin Cruisers"             
## [3] "Indianapolis Velocipedes"     "Miami Race Equipment"        
## [5] "Nashville Cruisers"           "New Orleans Velocipedes"     
## [7] "Oklahoma City Race Equipment" "Seattle Race Equipment"      
## 
## $X4
##  [1] "Albuquerque Cycles"    "Dallas Cycles"        
##  [3] "Denver Bike Shop"      "Detroit Cycles"       
##  [5] "Kansas City 29ers"     "Los Angeles Cycles"   
##  [7] "Minneapolis Bike Shop" "New York Cycles"      
##  [9] "Phoenix Bi-peds"       "Portland Bi-peds"     
## [11] "Providence Bi-peds"   
## 
## $X5
## [1] "Cincinnati Speed"          "Columbus Race Equipment"  
## [3] "Las Vegas Cycles"          "Louisville Race Equipment"
## [5] "San Francisco Cruisers"    "Wichita Speed"
# Combine cluster centroids with bike models for feature inspection ------------
custSegmentCntrs <- t(km.out.best$centers)  # Get centroids for groups
colnames(custSegmentCntrs) <- make.names(colnames(custSegmentCntrs))
customerTrends.clustered <- bind_cols(customerTrends[,1:5], as.data.frame(custSegmentCntrs))


# Arrange top 10 bike models by cluster in descending order --------------------
attach(customerTrends.clustered)  # Allows ordering by column name
knitr::kable(head(customerTrends.clustered[order(-X1), c(1:5, 6)], 10))
model category1 category2 frame price X1
Scalpel-Si Carbon 3 Mountain Cross Country Race Carbon [3500,12790] 0.0342692
Jekyll Carbon 4 Mountain Over Mountain Carbon [ 415, 3500) 0.0302818
Scalpel 29 Carbon Race Mountain Cross Country Race Carbon [3500,12790] 0.0280391
Trigger Carbon 3 Mountain Over Mountain Carbon [3500,12790] 0.0259353
Habit Carbon 2 Mountain Trail Carbon [3500,12790] 0.0233750
Trigger Carbon 4 Mountain Over Mountain Carbon [ 415, 3500) 0.0232606
Catalyst 4 Mountain Sport Aluminum [ 415, 3500) 0.0215639
Jekyll Carbon 2 Mountain Over Mountain Carbon [3500,12790] 0.0210792
Supersix Evo Hi-Mod Dura Ace 2 Road Elite Road Carbon [3500,12790] 0.0210578
Trigger Carbon 2 Mountain Over Mountain Carbon [3500,12790] 0.0210433
# Arrange top 10 bike models by cluster in descending order
knitr::kable(head(customerTrends.clustered[order(-X2), c(1:5, 7)], 10))
model category1 category2 frame price X2
Slice Ultegra Road Triathalon Carbon [ 415, 3500) 0.0554531
Trigger Carbon 4 Mountain Over Mountain Carbon [ 415, 3500) 0.0304147
CAAD12 105 Road Elite Road Aluminum [ 415, 3500) 0.0270792
Beast of the East 3 Mountain Trail Aluminum [ 415, 3500) 0.0263331
Trail 1 Mountain Sport Aluminum [ 415, 3500) 0.0249397
Bad Habit 1 Mountain Trail Aluminum [ 415, 3500) 0.0229976
F-Si Carbon 4 Mountain Cross Country Race Carbon [ 415, 3500) 0.0224490
CAAD12 Disc 105 Road Elite Road Aluminum [ 415, 3500) 0.0209568
Synapse Disc 105 Road Endurance Road Aluminum [ 415, 3500) 0.0209568
Trail 2 Mountain Sport Aluminum [ 415, 3500) 0.0203094
detach(customerTrends.clustered)