#https://datatables.net/reference/option/
options(DT.options = list(scrollX = TRUE, paging = TRUE, fixedHeader = TRUE, searchHighlight = TRUE))
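Assumed package setup, inferred from the function calls used throughout:

library(tidyverse)    #read_csv, dplyr verbs, ggplot2
library(janitor)      #clean_names
library(rsample)      #initial_split, training, testing
library(naniar)       #miss_var_summary
library(plotly)       #ggplotly, plot_ly
library(DT)           #datatable
library(DataExplorer) #plot_boxplot, plot_histogram, plot_density
library(GGally)       #ggcorr
library(factoextra)   #fviz_nbclust, eclust
library(cluster)      #pam, silhouette
library(ape)          #as.phylo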

Intro

The data come from the Mall Customer Segmentation Data page on Kaggle.

In one piped statement:

  1. read in the data
  2. convert character columns to factors
  3. lowercase all column names
  4. order columns alphabetically by name
  5. order columns by data type: nominal, then numeric

Get Data

a = read_csv('Mall_Customers.csv') %>%  #1
  mutate(across(where(is.character),as.factor)) %>% #2
  clean_names() %>% #3
  select(sort(tidyselect::peek_vars())) %>% #4
  select(where(is.factor), where(is.numeric)) %>%  #5
  select(-customer_id)

Split Data

set.seed(321)
split = a %>% initial_split()
train = split %>% training()
test = split %>% testing()
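initial_split() defaults to a 3/4 training proportion; a quick size check:

nrow(train) #150 training rows, per the glimpse output below
nrow(test)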

EDA: nom vars

check head rows

train %>% select(where(is.factor)) %>% head %>% DT::datatable()

glimpse structure

train %>% select(where(is.factor)) %>% glimpse
## Rows: 150
## Columns: 1
## $ gender <fct> Male, Male, Female, Female, Female, Female, Female, Male, Fe...

check for missing values

train %>% select(where(is.factor)) %>% miss_var_summary()

distribution: counts of unique levels

sapply(train %>% select(where(is.factor)), dplyr::n_distinct)
## gender 
##      2

reference: names of unique levels

sapply(train %>% select(where(is.factor)), unique)
##      gender  
## [1,] "Male"  
## [2,] "Female"

binarize gender to numeric var

train = train %>% mutate(gender = if_else(gender == 'Male', 1, 0))
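A quick count verifies the new encoding:

train %>% count(gender) #1 = Male, 0 = Female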

distribution: viz

ggplotly(
  train %>%
    count(gender = factor(gender)) %>%
    mutate(percent = n / nrow(train)) %>%
    ggplot(aes(percent, gender, fill = gender)) +
    geom_col() +
    scale_x_continuous(labels = scales::percent) +
    labs(x = '', y = '', title = 'Gender Percent Breakdown: 1 = Male, 0 = Female') +
    theme(legend.position = 'none')
)

EDA: num vars

check head rows

train %>% select(where(is.numeric)) %>% head %>% DT::datatable()

glimpse structure

train %>% select(where(is.numeric)) %>% glimpse
## Rows: 150
## Columns: 4
## $ gender               <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0...
## $ age                  <dbl> 19, 21, 20, 31, 22, 35, 23, 64, 30, 67, 35, 24...
## $ annual_income_k      <dbl> 15, 15, 16, 17, 17, 18, 18, 19, 19, 19, 19, 20...
## $ spending_score_1_100 <dbl> 39, 81, 6, 40, 76, 6, 94, 3, 72, 14, 99, 77, 1...

check for missing values

miss_var_summary(train %>% select(where(is.numeric)))

distribution: viz (boxplots by gender)

DataExplorer::plot_boxplot(train %>% select(where(is.numeric), gender), by = 'gender')

distribution: viz (histograms)

DataExplorer::plot_histogram(train %>% select(where(is.numeric)))

distribution: viz (density curves)

DataExplorer::plot_density(train %>% select(where(is.numeric)))

pairwise correlations: viz

GGally::ggcorr(train %>% select(where(is.numeric)), low = '#990000', mid = '#E0E0E0', high = '#009900', label = TRUE)

Preprocessing

Reference: [package recipes](https://recipes.tidymodels.org/reference/index.html)

normalize data

### normalize data so certain features aren't unfairly weighted
train.normalized = train %>% scale

Create matrix

### scale() already returns a matrix; keep an explicit matrix object for the clustering calls below
train.matrix = as.matrix(train.normalized)
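Sanity check: scale() is plain column-wise z-scoring, (x - mean(x)) / sd(x); 'age' is used as an example here:

all.equal(
  as.numeric(train.matrix[, 'age']),
  (train$age - mean(train$age)) / sd(train$age)
)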

Determine Optimal Number of Clusters

Reference

1) silhouette analysis with kmeans and euclidean distance

factoextra::fviz_nbclust(
  train.matrix,
  diss = dist(train.matrix, method = "euclidean"),
  FUNcluster=kmeans,
  method="silhouette"
  ) +
  theme_classic()
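For intuition, the statistic behind these plots can be computed by hand; a minimal sketch for a single candidate k (k = 6 chosen arbitrarily):

#average silhouette width for one k, computed directly
km6 = kmeans(train.matrix, centers = 6, nstart = 25)
sil = cluster::silhouette(km6$cluster, dist(train.matrix, method = 'euclidean'))
mean(sil[, 'sil_width']) #fviz_nbclust plots this value for each k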

2) silhouette analysis with kmeans and manhattan distance

factoextra::fviz_nbclust(
  train.matrix,
  diss = dist(train.matrix, method = "manhattan"),
  FUNcluster=kmeans,
  method="silhouette"
  ) +
  theme_classic()

3) silhouette analysis with pam and euclidean distance

factoextra::fviz_nbclust(
  train.matrix,
  diss = dist(train.matrix, method = "euclidean"),
  FUNcluster=cluster::pam,
  method="silhouette"
  ) +
  theme_classic()

4) silhouette analysis with pam and manhattan distance

factoextra::fviz_nbclust(
  train.matrix,
  diss = dist(train.matrix, method = "manhattan"),
  FUNcluster=cluster::pam,
  method="silhouette"
  ) +
  theme_classic()
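The suggested k can also be pulled out programmatically; a sketch that assumes fviz_nbclust's returned ggplot stores the per-k silhouette widths in $data columns clusters and y:

p = factoextra::fviz_nbclust(train.matrix, FUNcluster = kmeans, method = "silhouette")
p$data$clusters[which.max(p$data$y)] #k with the largest average silhouette width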

Clustering via Hierarchical Methods

Reference 1 Reference 2

dist.train = dist(train, method = 'euclidean')

#ward.D2 creates groups such that within-cluster variance is minimized; dissimilarities are squared before clustering
hclust.train = hclust(dist.train, method = 'ward.D2') 
#hclust.train = hclust(del.distanced, method = 'complete') 

plot(hclust.train, hang = -1, cex = 0.8, main = NULL, xlab = NULL)

h = 75

#cutree(hclust.train, h = h) #cut at a given height
#cutree(hclust.train, k = 9) #cut into a given number of clusters

plot(hclust.train, hang = -1, cex = 0.8, main = NULL, xlab = NULL);abline(h = h, col = 'blue', lty = 2)

#At a given height cutoff h,
#clusters merged below h are within a euclidean ward.D2 linkage distance <= h of one another

hclust.colors = RColorBrewer::brewer.pal(9, 'Paired') #choose a number of colors matching the optimal number of clusters

hclust.clusters = cutree(hclust.train, 9)

plot(ape::as.phylo(hclust.train), type = 'fan', tip.color = hclust.colors[hclust.clusters],
     label.offset = 2, cex = 1.0)

plot(ape::as.phylo(hclust.train), type = 'unrooted', tip.color = hclust.colors[hclust.clusters],
     label.offset = 2, cex = 1.0, no.margin = TRUE)
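A quick tabulation shows how evenly the 9 hierarchical clusters are sized:

table(hclust.clusters)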

Clustering via Centroid Based Methods

Kmeans

km = factoextra::eclust(
  train.matrix,
  FUNcluster = "kmeans",
  k = 9,
  hc_metric = "manhattan"
  )
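The eclust result keeps the standard kmeans fields, handy for a quick numeric check:

km$size         #observations per cluster
km$tot.withinss #total within-cluster sum of squares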

PAM

pam = factoextra::eclust(
  train.matrix,
  FUNcluster = "pam",
  k = 9,
  hc_metric = "manhattan"
  )
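Unlike kmeans centroids, pam medoids are actual (scaled) observations, so each cluster's prototype can be inspected directly:

pam$medoids #one real observation per cluster, in the scaled feature space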

Summarize Clustering Results

summary: data

train = train %>% mutate(cluster = factor(pam$cluster, levels = 1:9))

train %>%
  mutate(gender = factor(if_else(gender == 1, 'Male', 'Female'))) %>%
  group_by(cluster, gender) %>%
  summarise(
    mean.age = mean(age, na.rm = TRUE),
    mean.income = mean(annual_income_k, na.rm = TRUE),
    mean.spending.score = mean(spending_score_1_100, na.rm = TRUE),
    count = n()
  )

summary: viz

ggplotly(
  train %>%
    mutate(gender = factor(if_else(gender == 1, 'Male', 'Female'))) %>%
    ggplot(aes(cluster, annual_income_k, fill = cluster)) +
    geom_boxplot() +
    facet_wrap(~gender) +
    scale_y_continuous(breaks = seq(min(train$annual_income_k), max(train$annual_income_k), 10))
)

ggplotly(
  train %>%
    mutate(gender = factor(if_else(gender == 1, 'Male', 'Female'))) %>%
    ggplot(aes(cluster, age, fill = cluster)) +
    geom_boxplot() +
    facet_wrap(~gender)
)

ggplotly(
  train %>%
    mutate(gender = factor(if_else(gender == 1, 'Male', 'Female'))) %>%
    ggplot(aes(cluster, spending_score_1_100, fill = cluster)) +
    geom_boxplot() +
    facet_wrap(~gender)
)

xvars = train %>% select(-cluster) %>% names %>% as.character()

jpal = colorRampPalette(RColorBrewer::brewer.pal(8,'Dark2'))(25)

train %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[2])), color = ~cluster, colors = jpal) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[2],' by cluster'), xaxis = list(title = xvars[2]))
train %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[3])), color = ~cluster, colors = jpal) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[3],' by cluster'), xaxis = list(title = xvars[3]))
train %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[4])), color = ~cluster, colors = jpal) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[4],' by cluster'), xaxis = list(title = xvars[4]))
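The three plot_ly calls differ only in the x variable; a hypothetical helper that builds the ~var mapping with as.formula removes the repetition:

#hypothetical helper: horizontal box plot of one numeric column by cluster
plot_cluster_box = function(data, var) {
  data %>%
    plot_ly(y = ~cluster, x = as.formula(paste0('~', var)),
            color = ~cluster, colors = jpal) %>%
    add_boxplot() %>%
    hide_legend() %>%
    layout(title = paste0(var, ' by cluster'), xaxis = list(title = var))
}

#same three plots as above
plot_cluster_box(train, xvars[2])
plot_cluster_box(train, xvars[3])
plot_cluster_box(train, xvars[4])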