Illustration by the author of this report.
Greetings all! We are back again yet with another project with yours trully, Gissella Nadya. This is my submission for Unsupervised Learning project. The dataset can be obtained through here or here. Let’s get started!
library(tidyverse)
library(ggplot2)
library(ggpubr)
library(FactoMineR)
library(factoextra) # fviz_pca_ind(), fviz_eig()
library(tictoc)
library(ggradar)
library(scales)
library(plotly)starbucks <- read_csv("starbucks_drink.csv") %>% janitor::clean_names()
head(starbucks)glimpse(starbucks)#> Rows: 2,068
#> Columns: 18
#> $ category <chr> "iced-coffee", "iced-coffee", "iced-coffee", "ic…
#> $ name <chr> "Cold Brew with Cascara Cold Foam", "Cold Brew w…
#> $ portion_fl_oz <dbl> 12, 16, 24, 30, 30, 30, 12, 12, 16, 16, 24, 24, …
#> $ calories <dbl> 50, 80, 100, 130, 160, 5, 60, 0, 80, 5, 120, 5, …
#> $ calories_from_fat <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 25, 25, …
#> $ total_fat_g <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0…
#> $ saturated_fat_g <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0…
#> $ trans_fat_g <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#> $ cholesterol_mg <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, …
#> $ sodium_mg <dbl> 25, 30, 40, 45, 15, 10, 10, 0, 10, 5, 15, 10, 75…
#> $ total_carbohydrate_g <dbl> 11, 17, 22, 28, 40, 0, 15, 0, 20, 0, 30, 0, 3, 4…
#> $ dietary_fiber_g <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
#> $ sugars_g <dbl> 11, 17, 22, 28, 39, 0, 15, 0, 20, 0, 30, 0, 2, 4…
#> $ protein_g <dbl> 1, 2, 2, 2, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 5, …
#> $ caffeine_mg <chr> "145", "190", "280", "320", "280", "330", "120",…
#> $ size <chr> "Tall", "Grande", "Venti Iced", "Trenta Iced", "…
#> $ milk <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
#> $ whipped_cream <chr> NA, NA, NA, NA, "Sweetened", "Unsweetened", "Swe…
To make our data wrangling easier, we are going to make id column.
starbucks <- starbucks %>%
mutate(id = rownames(starbucks))colSums(is.na(starbucks))#> category name portion_fl_oz
#> 0 0 0
#> calories calories_from_fat total_fat_g
#> 0 0 0
#> saturated_fat_g trans_fat_g cholesterol_mg
#> 0 0 0
#> sodium_mg total_carbohydrate_g dietary_fiber_g
#> 0 0 0
#> sugars_g protein_g caffeine_mg
#> 0 0 0
#> size milk whipped_cream
#> 12 271 813
#> id
#> 0
It appears we have missing observations on 3 columns, let’s fix that.
Size
unique(starbucks$size)#> [1] "Tall" "Grande" "Venti Iced" "Trenta Iced" NA
#> [6] "Short" "Venti" "Kids"
Showing the NA column:
starbucks %>%
count(size, portion_fl_oz, sort = T) %>%
arrange(portion_fl_oz) Here, we are assuming that the portion 8 that has NA value considered as short size, 11 tall, and 14 & 14.5 is tall.
nasize <- starbucks[is.na(starbucks$size),] %>%
mutate(size = case_when(portion_fl_oz == "8" ~ "Short",
portion_fl_oz >= "11" ~ "Tall" )) %>%
select(id, size)
clean_sbux <- left_join(starbucks, nasize, by = "id")
clean_sbux <- clean_sbux %>% mutate(size = coalesce(size.x,size.y)) %>% select(-c(size.x,size.y))Milk
colSums(is.na(clean_sbux))#> category name portion_fl_oz
#> 0 0 0
#> calories calories_from_fat total_fat_g
#> 0 0 0
#> saturated_fat_g trans_fat_g cholesterol_mg
#> 0 0 0
#> sodium_mg total_carbohydrate_g dietary_fiber_g
#> 0 0 0
#> sugars_g protein_g caffeine_mg
#> 0 0 0
#> milk whipped_cream id
#> 271 813 0
#> size
#> 0
unique(clean_sbux$milk)#> [1] NA "Almond" "Coconut"
#> [4] "Nonfat milk" "Whole Milk" "2% Milk"
#> [7] "Soy (United States)"
clean_sbux[is.na(clean_sbux$milk),]clean_sbux$milk <- ifelse(is.na(clean_sbux$milk), "None", clean_sbux$milk)Whipped Cream
unique(clean_sbux$whipped_cream)#> [1] NA "Sweetened" "Unsweetened" "No Whipped Cream"
#> [5] "Whipped Cream"
clean_sbux[is.na(clean_sbux$whipped_cream),]clean_sbux$whipped_cream <- ifelse(is.na(clean_sbux$whipped_cream), "No Whipped Cream", clean_sbux$whipped_cream)Caffeine Mg
unique(clean_sbux$caffeine_mg)#> [1] "145" "190" "280" "320" "330" "120" "140" "165"
#> [9] "235" "275" "240" "90" "110" "125" "150" "170"
#> [17] "205" "85" "215" "200" "270" "265" "155" "310"
#> [25] "360" "185" "35–45" "45–55" "70–85" "90–110" "5–85" "35"
#> [33] "45" "70" "0" "50–55" "20–25" "25–30" "40–45" "55"
#> [41] "180" "475" "75" "115" "195" "375" "445" "15"
#> [49] "20" "25" "30" "130" "260" "340" "410" "315"
#> [57] "255" "225" "300" "95" "175" "425" "100" "290"
#> [65] "65" "40" "10" "50" "25–40" "0–15" "15–25" "15–20"
#> [73] "30–35" "80" "40+" "40–60"
clean_sbux$caffeine_mg <- str_trim(str_replace_all(clean_sbux$caffeine_mg,
pattern = "–",
replacement = " - " ))
clean_sbuxnacaffeine <- clean_sbux %>%
filter(grepl(" - ", caffeine_mg)) %>%
select(id, caffeine_mg) %>%
separate(caffeine_mg, sep = " - ", into = c("cmg1", "cmg2")) %>%
mutate(cmg1 = as.numeric(cmg1),
cmg2 = as.numeric(cmg2),
caffeine_mg = (cmg1 + cmg2)/2) %>%
select(id, caffeine_mg)
clean_sbux$caffeine_mg <- gsub("\\+", "", clean_sbux$caffeine_mg)clean_sbux <- left_join(clean_sbux, nacaffeine, by = "id")
clean_sbux$caffeine_mg.x[grepl(" - ", clean_sbux$caffeine_mg.x)] <- NA
clean_sbux <- clean_sbux %>%
mutate(caffeine_mg.x = as.numeric(caffeine_mg.x),
caffeine_mg = coalesce(caffeine_mg.x, caffeine_mg.y)) %>%
select(-c(caffeine_mg.x, caffeine_mg.y))clean_sbux <- clean_sbux %>%
mutate_if(~is.character(.), ~as.factor(.)) %>%
select(-id)unique(clean_sbux$size)#> [1] Tall Grande Venti Iced Trenta Iced Short Venti
#> [7] Kids
#> Levels: Grande Kids Short Tall Trenta Iced Venti Venti Iced
clean_sbux$size <- factor(clean_sbux$size, levels = c("Kids", "Short", "Tall", "Grande", "Venti Iced", "Venti", "Trenta Iced"))clean_sbux <- clean_sbux %>%
mutate(caffeinated = case_when(caffeine_mg == "0" ~ "No",
caffeine_mg > 0 ~ "Yes"),
caffeinated = as.factor(caffeinated))summary(clean_sbux$calories)#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.0 117.5 210.0 223.4 320.0 650.0
clean_sbux <- clean_sbux %>%
mutate(calories_cat = case_when(calories < 117.5 ~ "Low",
calories >= 117.5 & calories <= 320.0 ~ "Medium",
calories > 320.0 ~ "High"),
calories_cat = as.factor(calories_cat))colSums(is.na(clean_sbux))#> category name portion_fl_oz
#> 0 0 0
#> calories calories_from_fat total_fat_g
#> 0 0 0
#> saturated_fat_g trans_fat_g cholesterol_mg
#> 0 0 0
#> sodium_mg total_carbohydrate_g dietary_fiber_g
#> 0 0 0
#> sugars_g protein_g milk
#> 0 0 0
#> whipped_cream size caffeine_mg
#> 0 0 0
#> caffeinated calories_cat
#> 0 0
sbux_num <- clean_sbux %>% select_if(is.numeric)The possibility of Principle Component Analysis (PCA) can be seen through the correlations between each variables using ggcorr() function from GGally package.
Based on this result, we can see that there are some of the variables that has a strong positive correlation. We will try to reduce the dimension using PCA.
str(clean_sbux)#> tibble [2,068 × 20] (S3: tbl_df/tbl/data.frame)
#> $ category : Factor w/ 10 levels "bottled-drinks",..: 6 6 6 6 6 6 6 6 6 6 ...
#> $ name : Factor w/ 119 levels "Blonde Roast",..: 11 11 11 11 30 30 30 30 30 30 ...
#> $ portion_fl_oz : num [1:2068] 12 16 24 30 30 30 12 12 16 16 ...
#> $ calories : num [1:2068] 50 80 100 130 160 5 60 0 80 5 ...
#> $ calories_from_fat : num [1:2068] 0 0 0 0 0 0 0 0 0 0 ...
#> $ total_fat_g : num [1:2068] 0 0 0 0 0 0 0 0 0 0 ...
#> $ saturated_fat_g : num [1:2068] 0 0 0 0 0 0 0 0 0 0 ...
#> $ trans_fat_g : num [1:2068] 0 0 0 0 0 0 0 0 0 0 ...
#> $ cholesterol_mg : num [1:2068] 0 0 0 0 0 0 0 0 0 0 ...
#> $ sodium_mg : num [1:2068] 25 30 40 45 15 10 10 0 10 5 ...
#> $ total_carbohydrate_g: num [1:2068] 11 17 22 28 40 0 15 0 20 0 ...
#> $ dietary_fiber_g : num [1:2068] 0 0 0 0 0 0 0 0 0 0 ...
#> $ sugars_g : num [1:2068] 11 17 22 28 39 0 15 0 20 0 ...
#> $ protein_g : num [1:2068] 1 2 2 2 1 1 0 0 0 0 ...
#> $ milk : Factor w/ 7 levels "2% Milk","Almond",..: 4 4 4 4 4 4 4 4 4 4 ...
#> $ whipped_cream : Factor w/ 4 levels "No Whipped Cream",..: 1 1 1 1 2 3 2 3 2 3 ...
#> $ size : Factor w/ 7 levels "Kids","Short",..: 3 4 5 7 7 7 3 3 4 4 ...
#> $ caffeine_mg : num [1:2068] 145 190 280 320 280 330 120 140 165 190 ...
#> $ caffeinated : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
#> $ calories_cat : Factor w/ 3 levels "High","Low","Medium": 2 2 2 3 3 2 2 2 2 2 ...
a <- ggplot(data = clean_sbux, aes(x = size, y = portion_fl_oz, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "portion_fl_oz")
b <- ggplot(data = clean_sbux, aes(x = size, y = calories, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "calories")
c <- ggplot(data = clean_sbux, aes(x = size, y = calories_from_fat, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "calories_from_fat")
d <- ggplot(data = clean_sbux, aes(x = size, y = total_fat_g, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "total_fat_g")
e <- ggplot(data = clean_sbux, aes(x = size, y = saturated_fat_g, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "saturated_fat_g")
f <- ggplot(data = clean_sbux, aes(x = size, y = trans_fat_g, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "trans_fat_g")
g <- ggplot(data = clean_sbux, aes(x = size, y = cholesterol_mg, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "cholesterol_mg")
h <- ggplot(data = clean_sbux, aes(x = size, y = sodium_mg, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "sodium_mg")
i <- ggplot(data = clean_sbux, aes(x = size, y = total_carbohydrate_g, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "total_carbohydrate_g")
j <- ggplot(data = clean_sbux, aes(x = size, y = dietary_fiber_g, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "dietary_fiber_g")
k <- ggplot(data = clean_sbux, aes(x = size, y = sugars_g, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "sugars_g")
l <- ggplot(data = clean_sbux, aes(x = size, y = protein_g, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "protein_g")
m <- ggplot(data = clean_sbux, aes(x = size, y = caffeine_mg, fill = size)) + geom_boxplot(show.legend = F, alpha = 0.3) + theme_minimal() + labs(title = "caffeine_mg") ggpubr::ggarrange(a,b,c,d,e,f,g,h,i,j,k,l,m,
ncol = 2, nrow = 2)#> $`1`
#>
#> $`2`
#>
#> $`3`
#>
#> $`4`
#>
#> attr(,"class")
#> [1] "list" "ggarrange"
From the barplot, we can see that most of the time the barplot are in order of kids > short > tall > grande > venti iced > venti > trenta iced. But in calories, it appears that Trenta is lower and even lower than the Kids size, alongside with total fat, saturated fat, cholesterol, sodium, dietary fiber, sugars, and protein. It appears that the highest caffeine level is in Trenta size. Highest protein is Venti size. Sugar level is in Venti Iced, the same as total carbs. To find more interesting and undiscovered pattern in the data, we will use clustering method using Clustering or K-Means.
PCA is Principle Component Analysis. The objective of PCA is to find Q, so that such a linear transformation is possible. By using nearZeroVar we are able to show that to eliminate ~50% of the original predictors we still retain enough information. PCA shares the same objective, but does it differently: it looks for correlation within our data and use that redundancy to create a new matrix Z with just enough dimensions to explain most of the variance in the original data. The new variables of matrix Z are called principal components.
# making index for quantitative and qualitative variables
quanti_var <- c(3:14, 18)
quali_var <- c(1:2, 15:17, 19:20)FactoMineR package for function PCA()
pca_sbux <- PCA(X = clean_sbux, # data
scale.unit = T, # scaling
quali.sup = quali_var, # data categorical (the number)
graph = F) # F = not showing the plot
# ncp = 10) # default = 5
pca_sbux$eig#> eigenvalue percentage of variance cumulative percentage of variance
#> comp 1 6.8941466905 53.031897619 53.03190
#> comp 2 1.6929489271 13.022684055 66.05458
#> comp 3 1.4026800523 10.789846557 76.84443
#> comp 4 0.8805313802 6.773318309 83.61775
#> comp 5 0.7706336358 5.927951045 89.54570
#> comp 6 0.6767389122 5.205683940 94.75138
#> comp 7 0.3480422540 2.677248108 97.42863
#> comp 8 0.1499827600 1.153713539 98.58234
#> comp 9 0.1241435841 0.954950647 99.53729
#> comp 10 0.0562579815 0.432753704 99.97005
#> comp 11 0.0016887882 0.012990678 99.98304
#> comp 12 0.0015548463 0.011960356 99.99500
#> comp 13 0.0006501877 0.005001444 100.00000
fviz_eig(pca_sbux, ncp = 13,
addlabels = T, main = "Variance explained by each dimensions")Through the PCA, I can retain some informative PC (high in cumulative variance) to perform dimension reduction. By doing this, I can reduce the dimension of the variables while also retaining as much information as possible. In this project, I would like to retain at least 80% of the data. Therefore I am going to choose Comp 1-4 (83.61775%).
After deciding that we are going to us PC1 to PC4, we can extract the values and make it a new dataframe. It can be later udes as Supervised Learning Classification Technique.
# New df from PCA Result
sbux_df_from_pca <- data.frame(pca_sbux$ind$coord[,1:4])
sbux_df_from_pca_x <- cbind(sbux_df_from_pca, Type = clean_sbux$size)
head(sbux_df_from_pca_x)A biplot, using only two axis, communicates hidden structure that are otherwise difficult to meaningfully visualize or compare. It only consists of 2 PCs, usually PC1 and PC2.
pca_biplot_sbux <- prcomp(sbux_num %>% head(100), scale = F)
biplot(pca_biplot_sbux,
cex = 0.6, # font size
choices = c(1,2), # PC 1 and 2
scale = F) # fviz_pca_biplot(pca_biplot_sbux, repel = T, invisible = "quali")plot.PCA(x = pca_sbux,
choix = "ind", # individual plot
invisible = "quali", # to make the category tables invisible
select = "contrib5", # 5 outliers
habillage = "size") # color the dots depends on ...Through the individual plot of PCA, dim 1 could cover 53.03% variance of data.
We also found the 5 outlier to be (depends on the size):
size Venti : 573, 1037, 1745, 1747size Venti Iced : 453fviz_pca_ind(pca_sbux, habillage = "calories_cat", addEllipses = T)To represent more than two components tha variables will be positioned inside the circle of correlation. If the variable is closer to the circle (outside), that means the variable can reconstruct it better from the first two components. If the variable is closed to the center of the plot (inside), that means the variable is less important for the two components.
plot.PCA(x = pca_sbux,
choix = "var")fviz_pca_var(pca_sbux , select.var = list(contrib = 20), col.var = "contrib",
gradient.cols = c("cyan", "gold", "maroon"), repel = TRUE)fviz_contrib(X = pca_sbux,
choice = "var", # lihat kontribusi berdasarkan variable
axes = 1) # PC yang ingin di tampilkanThe highest contribution to Dimension 1 is calories, and the lowest is caffeine_mg.
The highest contribution to Dimension 1 is portion_fl_oz, and the lowest is protein_g.
The highest contribution to Dimension 1 is caffeine_mg, and the lowest is calories_from_fat.
The highest contribution to Dimension 1 is
dietary_fiber_g, and the lowest is sodium_mg.
dimension <- dimdesc(pca_sbux)
as.data.frame(dimension$Dim.1$quanti)calories gave the strongest positive correlation to PC1 / Dim1 with 0.97241376, while caffeine_mg gave the strongest negative correlation to PC1 Dim1 with -0.09533722.
as.data.frame(dimension$Dim.2$quanti)On the second dimension / PC2, the strongest positive correlation is portion_fl_oz with 0.6036598, and the strongest negative correlation is cholesterol_mg with -0.4280593.
as.data.frame(dimension$Dim.3$quanti)On the third dimension / PC3, the strongest positive correlation is caffeine_mg with 0.83899020, and the strongest negative correlation is dietary_fiber_g with -0.26284407.
Clustering must only use the numerical variables, therefore we will use sbuxnum.
head(sbux_num)sbux_num_z <- scale(sbux_num)# build the loop from k=2 until k=15 by 1
k_sequences <- seq(2,15,1)
k_num <- length(k_sequences)
# make a blank table for k, wss and btration
kmeans_df <- tibble(k = rep(0, k_num),
wss = rep(0, k_num),
btratio = rep(0, k_num))
# function to try to find k optimum
for(i in 1:k_num){
k <- k_sequences[i]
loop_sbux_kmeans <- kmeans(sbux_num_z, i) # looping i = depends on k_num (length)
kmeans_df[i,"k"] <- k # fill the k values
kmeans_df[i,"wss"] <- loop_sbux_kmeans$tot.withinss # fill the wss value
kmeans_df[i,"btratio"] <- loop_sbux_kmeans$betweenss/loop_sbux_kmeans$totss #store btratio value
}
# draw the plot from loop
wss.p <- ggplot(data = kmeans_df, aes(x = k, y = wss)) + geom_point() + geom_line()+
labs(title = "Within sum of square")
btratio.p <- ggplot(data = kmeans_df, aes(x = k, y = btratio)) + geom_point() + geom_line()+
labs(title = "betweenss/totalss")
cowplot::plot_grid(wss.p, btratio.p)Through the elbow method we can see that the k optimum is 4.
RNGkind(sample.kind = "Rounding")
set.seed(598)
sbux_num_km <- kmeans(x = sbux_num_z,
centers = 4)The amount of observations for each clusters are:
sbux_num_km$size#> [1] 877 448 306 437
The centroid, usually used for cluster profiling
sbux_num_km$centers#> portion_fl_oz calories calories_from_fat total_fat_g saturated_fat_g
#> 1 -0.2185002 -0.9189924 -0.7399713 -0.7384593 -0.6905663
#> 2 -0.4956848 0.2741426 0.6689763 0.6624429 0.6839741
#> 3 0.6340108 1.6342356 1.7194258 1.7225746 1.6505906
#> 4 0.5027096 0.4189116 -0.4047846 -0.4033260 -0.4711086
#> trans_fat_g cholesterol_mg sodium_mg total_carbohydrate_g dietary_fiber_g
#> 1 -0.1786636 -0.6602144 -0.8480530 -0.84852546 -0.41331943
#> 2 -0.1786636 0.7222773 0.1292993 -0.01670122 -0.04235576
#> 3 1.0287754 1.6636736 1.3515311 1.33159483 0.71211777
#> 4 -0.1786636 -0.5804493 0.6229929 0.78757655 0.37425283
#> sugars_g protein_g caffeine_mg
#> 1 -0.841216464 -0.5695935 0.174616975
#> 2 -0.009072493 0.3572473 -0.366609783
#> 3 1.310668007 0.8012886 0.030491328
#> 4 0.779741204 0.2157721 0.004054345
#> [1] "The iteration occurs: 5 times until the clusters are stable"
A good clustering results can be looked by 3 aspects, Within Sum of Squares ($withinss), Between Sum of Squares ($betweenss) and Total Sum of Squares ($totss)
sbux_num_km$withinss#> [1] 3869.895 2056.122 4063.169 2731.760
#> [1] "BSS = 14150.054496345"
#> [1] "TSS = 26871"
A good clustering should have:
#> [1] "BSS/TSS = 1.11234298521909"
clean_sbux$cluster <- sbux_num_km$cluster
tail(clean_sbux %>% select(category, name, milk, cluster))Profiling are often used to give product recommendations to the customers.
profile <- clean_sbux %>%
group_by(cluster) %>%
summarise_all("mean") %>%
select_if(~ !any(is.na(.)))
profileInsights profiling =
Cluster 1 = Medium portion, Lowest calories, cholesterol, sodium, carbohydrate, dietary fiber, sugar, protein, Highly caffeinated.Cluster 2 = Smallest portion, High calorie, cholesterol, Low dietary fiber, sugar, protein, Lowest caffeine.Cluster 3 = Biggest portion, High caffeine, Highest calories, cholesterol, sodium, carbohydrate, dietary fiber, sugar, protein.Cluster 4 = Big portion, High calories, sodium, carbohydrate, sugar, caffeine, Low cholesterol.To be able to see a better visualization, we could make a radar plot.
sbux_radar <- clean_sbux %>%
group_by(cluster) %>%
summarise_all("mean") %>%
select_if(~ !any(is.na(.))) %>%
rename(group = cluster) %>%
mutate(group = as.character(group)) %>%
mutate_at(vars(-group),
funs(rescale))
ggradar(sbux_radar,
grid.label.size = 3,
axis.label.size = 3.2,
group.point.size = 5,
group.line.width = 1,
legend.text.size = 10,
plot.title = "Starbucks Nutrion Facts Clustering",
legend.position = "right")fviz_cluster(object = sbux_num_km, #kmeans
data = sbux_num_z, # numerical
labelsize = 2) + theme_minimal()# sbux_df_from_pca
df_plot <- cbind(sbux_df_from_pca, cluster = clean_sbux$cluster,
name = clean_sbux$name, category = clean_sbux$category,
milk = clean_sbux$milk,
whipped_cream = clean_sbux$whipped_cream,
caffeinated = clean_sbux$caffeinated,
size = clean_sbux$size)
df_plot <- df_plot %>%
mutate(cluster = as.factor(cluster))After exploring our dataset by using PCA and K-Means for clustering, we are able to conclude that:
Thank you for taking the time to read my project, I hope it could be useful for you as well. Don’t forget to connect with me at my LinkedIn profile. I cannot wait to discuss more about machine learning with you! Hope you have a great day.