#https://datatables.net/reference/option/
options(DT.options = list(scrollX = TRUE, paging = TRUE, fixedHeader = TRUE, searchHighlight = TRUE))
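Assumed package setup, inferred from the function calls used throughout:

library(tidyverse)    #read_csv, dplyr verbs, ggplot2
library(janitor)      #clean_names
library(rsample)      #initial_split, training, testing
library(naniar)       #miss_var_summary
library(plotly)       #ggplotly, plot_ly
library(DT)           #datatable
library(DataExplorer) #plot_boxplot, plot_histogram, plot_density
library(GGally)       #ggcorr
library(factoextra)   #fviz_nbclust, eclust
library(cluster)      #pam, silhouette
library(ape)          #as.phylo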

Intro

The data come from the Mall Customer Segmentation Data page on Kaggle.

In one piped statement:

  1. read in the data
  2. convert character columns to factors
  3. lowercase all column names
  4. order columns alphabetically by name
  5. order columns by data type: nominal, then numeric

Get Data

a = read_csv('Mall_Customers.csv') %>%  #1
  mutate(across(where(is.character),as.factor)) %>% #2
  clean_names() %>% #3
  select(sort(tidyselect::peek_vars())) %>% #4
  select(where(is.factor), where(is.numeric)) %>%  #5
  select(-customer_id)

Split Data

set.seed(321)
split = a %>% initial_split()
train = split %>% training()
test = split %>% testing()
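initial_split() defaults to a 3/4 training proportion; a quick size check:

nrow(train) #150 training rows, per the glimpse output below
nrow(test)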

EDA: nom vars

check head rows

train %>% select(where(is.factor)) %>% head %>% DT::datatable()

glimpse structure

train %>% select(where(is.factor)) %>% glimpse
## Rows: 150
## Columns: 1
## $ gender <fct> Male, Male, Female, Female, Female, Female, Female, Male, Fe...

check for missing values

train %>% select(where(is.factor)) %>% miss_var_summary()

distribution: counts of unique levels

sapply(train %>% select(where(is.factor)), dplyr::n_distinct)
## gender 
##      2

reference: names of unique levels

sapply(train %>% select(where(is.factor)), unique)
##      gender  
## [1,] "Male"  
## [2,] "Female"

binarize gender to numeric var

train = train %>% mutate(gender = if_else(gender == 'Male', 1, 0))
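A quick count verifies the new encoding:

train %>% count(gender) #1 = Male, 0 = Female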

distribution: viz

ggplotly(
  train %>%
    count(gender = factor(gender)) %>%
    mutate(percent = n / nrow(train)) %>%
    ggplot(aes(percent, gender, fill = gender)) +
    geom_col() +
    scale_x_continuous(labels = scales::percent) +
    labs(x = '', y = '', title = 'Gender Percent Breakdown: 1 = Male, 0 = Female') +
    theme(legend.position = 'none')
)

EDA: num vars

check head rows

train %>% select(where(is.numeric)) %>% head %>% DT::datatable()

glimpse structure

train %>% select(where(is.numeric)) %>% glimpse
## Rows: 150
## Columns: 4
## $ gender               <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0...
## $ age                  <dbl> 19, 21, 20, 31, 22, 35, 23, 64, 30, 67, 35, 24...
## $ annual_income_k      <dbl> 15, 15, 16, 17, 17, 18, 18, 19, 19, 19, 19, 20...
## $ spending_score_1_100 <dbl> 39, 81, 6, 40, 76, 6, 94, 3, 72, 14, 99, 77, 1...

check for missing values

miss_var_summary(train %>% select(where(is.numeric)))

distribution: viz (boxplots by gender)

DataExplorer::plot_boxplot(train %>% select(where(is.numeric), gender), by = 'gender')

distribution: viz (histograms)

DataExplorer::plot_histogram(train %>% select(where(is.numeric)))

distribution: viz (density curves)

DataExplorer::plot_density(train %>% select(where(is.numeric)))

pairwise correlations: viz

GGally::ggcorr(train %>% select(where(is.numeric)), low = '#990000', mid = '#E0E0E0', high = '#009900', label = TRUE)

Preprocessing

Reference: [package recipes](https://recipes.tidymodels.org/reference/index.html)

normalize data

### normalize data so certain features aren't unfairly weighted
train.normalized = train %>% scale

Create matrix

### scale() already returns a matrix; keep an explicit matrix object for the clustering calls below
train.matrix = as.matrix(train.normalized)
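Sanity check: scale() is plain column-wise z-scoring, (x - mean(x)) / sd(x); 'age' is used as an example here:

all.equal(
  as.numeric(train.matrix[, 'age']),
  (train$age - mean(train$age)) / sd(train$age)
)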

Determine Optimal Number of Clusters

Reference

1) silhouette analysis with kmeans and euclidean distance

factoextra::fviz_nbclust(
  train.matrix,
  diss = dist(train.matrix, method = "euclidean"),
  FUNcluster=kmeans,
  method="silhouette"
  ) +
  theme_classic()
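For intuition, the statistic behind these plots can be computed by hand; a minimal sketch for a single candidate k (k = 6 chosen arbitrarily):

#average silhouette width for one k, computed directly
km6 = kmeans(train.matrix, centers = 6, nstart = 25)
sil = cluster::silhouette(km6$cluster, dist(train.matrix, method = 'euclidean'))
mean(sil[, 'sil_width']) #fviz_nbclust plots this value for each k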

2) silhouette analysis with kmeans and manhattan distance

factoextra::fviz_nbclust(
  train.matrix,
  diss = dist(train.matrix, method = "manhattan"),
  FUNcluster=kmeans,
  method="silhouette"
  ) +
  theme_classic()

3) silhouette analysis with pam and euclidean distance

factoextra::fviz_nbclust(
  train.matrix,
  diss = dist(train.matrix, method = "euclidean"),
  FUNcluster=cluster::pam,
  method="silhouette"
  ) +
  theme_classic()

4) silhouette analysis with pam and manhattan distance

factoextra::fviz_nbclust(
  train.matrix,
  diss = dist(train.matrix, method = "manhattan"),
  FUNcluster=cluster::pam,
  method="silhouette"
  ) +
  theme_classic()
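The suggested k can also be pulled out programmatically; a sketch that assumes fviz_nbclust's returned ggplot stores the per-k silhouette widths in $data columns clusters and y:

p = factoextra::fviz_nbclust(train.matrix, FUNcluster = kmeans, method = "silhouette")
p$data$clusters[which.max(p$data$y)] #k with the largest average silhouette width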

Clustering via Hierarchical Methods

Reference 1 Reference 2

dist.train = dist(train, method = 'euclidean')

#ward.D2 creates groups such that within-cluster variance is minimized; dissimilarities are squared before clustering
hclust.train = hclust(dist.train, method = 'ward.D2') 
#hclust.train = hclust(del.distanced, method = 'complete') 

plot(hclust.train, hang = -1, cex = 0.8, main = NULL, xlab = NULL)

h = 75

#cutree(hclust.train, h = h) #cut at a given height
#cutree(hclust.train, k = 9) #cut into a given number of clusters

plot(hclust.train, hang = -1, cex = 0.8, main = NULL, xlab = NULL);abline(h = h, col = 'blue', lty = 2)

#At a given height cutoff h,
#clusters merged below h are within a euclidean ward.D2 linkage distance <= h of one another

hclust.colors = RColorBrewer::brewer.pal(9, 'Paired') #choose a number of colors matching the optimal number of clusters

hclust.clusters = cutree(hclust.train, 9)

plot(ape::as.phylo(hclust.train), type = 'fan', tip.color = hclust.colors[hclust.clusters],
     label.offset = 2, cex = 1.0)

plot(ape::as.phylo(hclust.train), type = 'unrooted', tip.color = hclust.colors[hclust.clusters],
     label.offset = 2, cex = 1.0, no.margin = TRUE)
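A quick tabulation shows how evenly the 9 hierarchical clusters are sized:

table(hclust.clusters)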

Clustering via Centroid Based Methods

Kmeans

km = factoextra::eclust(
  train.matrix,
  FUNcluster = "kmeans",
  k = 9,
  hc_metric = "manhattan"
  )
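The eclust result keeps the standard kmeans fields, handy for a quick numeric check:

km$size         #observations per cluster
km$tot.withinss #total within-cluster sum of squares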

PAM

pam = factoextra::eclust(
  train.matrix,
  FUNcluster = "pam",
  k = 9,
  hc_metric = "manhattan"
  )
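Unlike kmeans centroids, pam medoids are actual (scaled) observations, so each cluster's prototype can be inspected directly:

pam$medoids #one real observation per cluster, in the scaled feature space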

Summarize Clustering Results

summary: data

train = train %>% mutate(cluster = factor(pam$cluster, levels = 1:9))

train %>%
  mutate(gender = factor(if_else(gender == 1, 'Male', 'Female'))) %>%
  group_by(cluster, gender) %>%
  summarise(
    mean.age = mean(age, na.rm = TRUE),
    mean.income = mean(annual_income_k, na.rm = TRUE),
    mean.spending.score = mean(spending_score_1_100, na.rm = TRUE),
    count = n()
  )

summary: viz

ggplotly(
  train %>%
    mutate(gender = factor(if_else(gender == 1, 'Male', 'Female'))) %>%
    ggplot(aes(cluster, annual_income_k, fill = cluster)) +
    geom_boxplot() +
    facet_wrap(~gender) +
    scale_y_continuous(breaks = seq(min(train$annual_income_k), max(train$annual_income_k), 10))
)

ggplotly(
  train %>%
    mutate(gender = factor(if_else(gender == 1, 'Male', 'Female'))) %>%
    ggplot(aes(cluster, age, fill = cluster)) +
    geom_boxplot() +
    facet_wrap(~gender)
)

ggplotly(
  train %>%
    mutate(gender = factor(if_else(gender == 1, 'Male', 'Female'))) %>%
    ggplot(aes(cluster, spending_score_1_100, fill = cluster)) +
    geom_boxplot() +
    facet_wrap(~gender)
)

xvars = train %>% select(-cluster) %>% names %>% as.character()

jpal = colorRampPalette(RColorBrewer::brewer.pal(8,'Dark2'))(25)

train %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[2])), color = ~cluster, colors = jpal) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[2],' by cluster'), xaxis = list(title = xvars[2]))
train %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[3])), color = ~cluster, colors = jpal) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[3],' by cluster'), xaxis = list(title = xvars[3]))
train %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[4])), color = ~cluster, colors = jpal) %>% add_boxplot() %>% hide_legend() %>% layout(
  title = paste0(xvars[4],' by cluster'), xaxis = list(title = xvars[4]))
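The three plot_ly calls differ only in the x variable; a hypothetical helper that builds the ~var mapping with as.formula removes the repetition:

#hypothetical helper: horizontal box plot of one numeric column by cluster
plot_cluster_box = function(data, var) {
  data %>%
    plot_ly(y = ~cluster, x = as.formula(paste0('~', var)),
            color = ~cluster, colors = jpal) %>%
    add_boxplot() %>%
    hide_legend() %>%
    layout(title = paste0(var, ' by cluster'), xaxis = list(title = var))
}

#same three plots as above
plot_cluster_box(train, xvars[2])
plot_cluster_box(train, xvars[3])
plot_cluster_box(train, xvars[4])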