#https://datatables.net/reference/option/
options(DT.options = list(scrollX = TRUE, pagin=TRUE, fixedHeader = TRUE, searchHighlight = TRUE))
Check out this Kaggle webpage
In one piped statement:
a = read_csv('Mall_Customers.csv') %>% #1
mutate(across(where(is.character),as.factor)) %>% #2
clean_names(.) %>% #3
select(sort(tidyselect::peek_vars())) %>% #4
select(where(is.factor), where(is.numeric)) %>% #5
select(-customer_id)
#Split Data
set.seed(321)
split = a %>% initial_split()
train = split %>% training()
test = split %>% testing()
train %>% select(where(is.factor)) %>% head %>% DT::datatable()
### glimpse structure
train %>% select(where(is.factor)) %>% glimpse
## Rows: 150
## Columns: 1
## $ gender <fct> Male, Male, Female, Female, Female, Female, Female, Male, Fe...
train %>% select(where(is.factor)) %>% miss_var_summary()
sapply(train %>% select(where(is.factor)), n_unique)
## gender
## 2
sapply(train %>% select(where(is.factor)), unique)
## gender
## [1,] "Male"
## [2,] "Female"
train = train %>% mutate(gender = if_else(gender == 'Male', 1, 0))
ggplotly(
train %>% count(gender = factor(gender)) %>%
mutate(percent = n/nrow(train)) %>%
ggplot(aes(percent, gender, fill = gender)) +
geom_col() +
scale_x_continuous(labels = scales::percent) +
labs(x = '', y = '', title ='Gender Percent Breakdown: 1 = Male, 0 = Female') +
theme(legend.position = 'none')
)
train %>% select(where(is.numeric)) %>% head %>% DT::datatable()
train %>% select(where(is.numeric)) %>% glimpse
## Rows: 150
## Columns: 4
## $ gender <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0...
## $ age <dbl> 19, 21, 20, 31, 22, 35, 23, 64, 30, 67, 35, 24...
## $ annual_income_k <dbl> 15, 15, 16, 17, 17, 18, 18, 19, 19, 19, 19, 20...
## $ spending_score_1_100 <dbl> 39, 81, 6, 40, 76, 6, 94, 3, 72, 14, 99, 77, 1...
miss_var_summary(train %>% select(where(is.numeric)))
DataExplorer::plot_boxplot(train %>% select(where(is.numeric), gender), by = 'gender')
DataExplorer::plot_histogram(train %>% select(where(is.numeric)))
DataExplorer::plot_density(train %>% select(where(is.numeric)))
GGally::ggcorr(train %>% select(where(is.numeric)), low = '#990000', mid = '#E0E0E0', high = '#009900', label = TRUE)
Reference: (package recipes)[https://recipes.tidymodels.org/reference/index.html]
### normalize data so certain features aren't unfairly weighted
train.normalized = train %>% scale
factoextra::fviz_nbclust(
train.matrix,
diss = dist(train.matrix, method = "euclidean"),
FUNcluster=kmeans,
method="silhouette"
) +
theme_classic()
factoextra::fviz_nbclust(
train.matrix,
diss = dist(train.matrix, method = "manhattan"),
FUNcluster=kmeans,
method="silhouette"
) +
theme_classic()
factoextra::fviz_nbclust(
train.matrix,
diss = dist(train.matrix, method = "euclidean"),
FUNcluster=cluster::pam,
method="silhouette"
) +
theme_classic()
factoextra::fviz_nbclust(
train.matrix,
diss = dist(train.matrix, method = "manhattan"),
FUNcluster=cluster::pam,
method="silhouette"
) +
theme_classic()
dist.train = dist(train, method = 'euclidean')
#ward.D2, creates groups such that variance is minimized within clusters, dissimilarities are squared before clustering
hclust.train = hclust(dist.train, method = 'ward.D2')
#hclust.train = hclust(del.distanced, method = 'complete')
plot(hclust.train, hang = -1, cex = 0.8, main = NULL, xlab = NULL)
h = 75
#cutree(hclust.train, h = h) #height
#cutree(hclust.train, k = 9) #clusters
plot(hclust.train, hang = -1, cex = 0.8, main = NULL, xlab = NULL);abline(h = h, col = 'blue', lty = 2)
#At a given height cutoff of x
#Clusters below height x have a [dist.method] [hclust.method] distance <= height cutoff
hclust.colors = RColorBrewer::brewer.pal(9, 'Paired') #choose # of cols corresponding to optimal number of clusters
hclust.clusters = cutree(hclust.train, 9)
plot(as.phylo(hclust.train), type = 'fan', tip.color = hclust.colors[hclust.clusters],
label.offset = 2, cex = 1.0)
plot(ape::as.phylo(hclust.train), type = 'unrooted', tip.color = hclust.colors[hclust.clusters],
label.offset = 2, cex = 1.0, no.margin = TRUE)
km = eclust(
train.matrix,
FUNcluster="kmeans",
k=9,
hc_metric = "manhattan"
)
pam = eclust(
train.matrix,
FUNcluster="pam",
k=9,
hc_metric = "manhattan"
)
train = train %>% mutate(cluster = factor(pam$cluster, levels = 1:9))
train %>%
mutate(gender = factor(if_else(gender == 1, 'Male', 'Female'))) %>%
group_by(cluster, gender) %>%
summarise(
mean.age = mean(age, na.rm = TRUE),
mean.income = mean(annual_income_k, na.rm = TRUE),
mean.spending.score = mean(spending_score_1_100, na.rm = TRUE),
count = n()
)
ggplotly(train %>% mutate(gender = factor(if_else(gender == 1, 'Male', 'Female'))) %>% ggplot(aes(cluster, annual_income_k, fill = cluster)) + geom_boxplot() + facet_wrap(~gender) +
scale_y_continuous(breaks = seq(
min(train$annual_income_k),
max(train$annual_income_k), 10
)))
ggplotly(train %>% mutate(gender = factor(if_else(gender == 1, 'Male', 'Female'))) %>% ggplot(aes(cluster, age, fill = cluster)) + geom_boxplot() + facet_wrap(~gender))
ggplotly(train %>% mutate(gender = factor(if_else(gender == 1, 'Male', 'Female'))) %>% ggplot(aes(cluster, spending_score_1_100, fill = cluster)) + geom_boxplot() + facet_wrap(~gender))
xvars = train %>% select(-cluster) %>% names %>% as.character()
jpal = colorRampPalette(RColorBrewer::brewer.pal(8,'Dark2'))(25)
train %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[2])), color = ~cluster, colors = jpal) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[2],' by cluster'), xaxis = list(title = xvars[2]))
train %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[3])), color = ~cluster, colors = jpal) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[3],' by cluster'), xaxis = list(title = xvars[3]))
train %>% plot_ly(y = ~cluster, x = ~eval(as.name(xvars[4])), color = ~cluster, colors = jpal) %>% add_boxplot() %>% hide_legend() %>% layout(
title = paste0(xvars[4],' by cluster'), xaxis = list(title = xvars[4]))