mydata <- read.csv("C:/Users/Tajda/Downloads/burger-king-menu (2).csv")[, c(1, 2, 3, 5, 12, 13)]
colnames(mydata) <- c("Item", "Category", "Calories", "Fat", "Sugars", "Proteins")
head(mydata)
## Item Category Calories Fat Sugars Proteins
## 1 Whopper® Sandwich Burgers 660 40 11 28
## 2 Whopper® Sandwich with Cheese Burgers 740 46 11 32
## 3 Bacon & Cheese Whopper® Sandwich Burgers 790 51 11 35
## 4 Double Whopper® Sandwich Burgers 900 58 11 48
## 5 Double Whopper® Sandwich with Cheese Burgers 980 64 11 52
## 6 Triple Whopper® Sandwich Burgers 1130 75 11 67
Unit of observation: a menu item offered by Burger King.
Sample size: 77 (units of observation).
Categories: breakfast, chicken, burgers.
Calories: number of calories (kcal) in each menu item (meal).
Fat: grams (g) of fat in each menu item.
Sugars: grams (g) of sugars in each menu item.
Proteins: grams (g) of proteins in each menu item.
Source: https://www.kaggle.com/datasets/mattop/burger-king-menu-nutrition-data?select=burger-king-menu.csv
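The figures above can be checked directly after loading (a minimal sketch using only base R; it assumes the column names assigned above):
str(mydata) # expect 77 obs. of 6 variables
table(mydata$Category) # counts of menu items per category
summary(mydata[c("Calories", "Fat", "Sugars", "Proteins")]) # ranges of the numeric variables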
Clustering
# Standardization of the cluster variables is not needed, since all three (Fat, Sugars, Proteins) are measured in the same units (grams).
mydata$CategoryFactor <- factor(mydata$Category,
labels = c ("Burgers", "Chicken", "Breakfast"),
levels = c ("Burgers", "Chicken", "Breakfast"))
library(Hmisc)
rcorr(as.matrix(mydata[, c("Fat", "Sugars", "Proteins")]),
type = "pearson")
## Fat Sugars Proteins
## Fat 1.00 0.3 0.91
## Sugars 0.30 1.0 0.30
## Proteins 0.91 0.3 1.00
##
## n= 77
##
##
## P
## Fat Sugars Proteins
## Fat 0.0083 0.0000
## Sugars 0.0083 0.0072
## Proteins 0.0000 0.0072
# FINDING OUTLIERS
mydata$Dissimilarity <- sqrt(mydata$Fat^2 + mydata$Sugars^2 + mydata$Proteins^2) # Euclidean distance from the origin (0, 0, 0)
# 10 units with the highest dissimilarity values
head(mydata[order(-mydata$Dissimilarity), ], 10)
## Item Category Calories Fat Sugars Proteins CategoryFactor Dissimilarity
## 7 Triple Whopper® Sandwich with Cheese Burgers 1220 82 11 71 Burgers 109.02293
## 10 Cheddar Bacon King Sandwich Burgers 1190 84 11 64 Burgers 106.17438
## 6 Triple Whopper® Sandwich Burgers 1130 75 11 67 Burgers 101.16818
## 9 Bacon King Sandwich Burgers 1150 79 10 61 Burgers 100.30952
## 14 Double Stacker King Burgers 1050 68 11 61 Burgers 92.01087
## 37 Spicy Chicken Nuggets- 20 pc Chicken 1050 74 1 40 Chicken 84.12491
## 5 Double Whopper® Sandwich with Cheese Burgers 980 64 11 52 Burgers 83.19255
## 12 Double Quarter Pound King Sandwich Burgers 900 54 11 56 Burgers 78.56844
## 4 Double Whopper® Sandwich Burgers 900 58 11 48 Burgers 76.08548
## 33 Chicken Nuggets- 20pc Chicken 860 54 1 39 Chicken 66.61832
Looking at the dissimilarity values we can identify some outliers: there is a clear gap between about 92 and 100, with the four most extreme items all above 100.
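The gap is easier to see graphically (a minimal base-R sketch that plots the sorted dissimilarity values; the four outliers separate visibly from the rest):
plot(sort(mydata$Dissimilarity, decreasing = TRUE),
     xlab = "Rank", ylab = "Dissimilarity") # look for the jump after the 4th point
abline(h = 100, lty = 2) # cut-off used below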
mydata <- mydata[mydata$Dissimilarity < 100, ] # Removing observations 7, 10, 6 and 9 based on the dissimilarity.
library(factoextra)
# CALCULATING EUCLIDEAN DISTANCES
distance <- get_dist(mydata[c("Fat", "Sugars", "Proteins")],
                     method = "euclidean")
distance2 <- distance^2
# Showing the dissimilarity matrix
fviz_dist(distance2)
# HOPKINS STATISTIC
get_clust_tendency(mydata[c("Fat", "Sugars", "Proteins")],
n = nrow(mydata) - 1,
graph = FALSE)
## $hopkins_stat
## [1] 0.7998486
##
## $plot
## NULL
H0: There are no natural groups of objects.
H1: There are natural groups of objects.
The Hopkins statistic should be above 0.5; in our case it is 0.80, which indicates a clearly clusterable structure in the data.
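As a sanity check (a sketch; the seed is an arbitrary choice for reproducibility), the same statistic computed on uniformly random data of the same shape should fall near 0.5, confirming that our 0.80 reflects genuine structure:
set.seed(123) # hypothetical seed
random_data <- as.data.frame(apply(mydata[c("Fat", "Sugars", "Proteins")], 2,
                                   function(x) runif(length(x), min(x), max(x))))
get_clust_tendency(random_data,
                   n = nrow(random_data) - 1,
                   graph = FALSE) # hopkins_stat should come out close to 0.5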
library(dplyr)
WARD <- mydata[c("Fat", "Sugars", "Proteins")] %>%
  #scale() %>%
  get_dist(method = "euclidean") %>% # Selecting distance (Euclidean)
  hclust(method = "ward.D2") # Selecting algorithm (ward.D2 squares the distances internally)
WARD
##
## Call:
## hclust(d = ., method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 73
library(factoextra)
fviz_dend(WARD)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
fviz_dend(WARD,
k = 2,
cex = 0.5,
palette = "jama",
color_labels_by_k = TRUE,
rect = TRUE)
fviz_dend(WARD,
k = 3,
cex = 0.5,
palette = "jama",
color_labels_by_k = TRUE,
rect = TRUE)
fviz_dend(WARD,
k = 4,
cex = 0.5,
palette = "jama",
color_labels_by_k = TRUE,
rect = TRUE)
fviz_dend(WARD,
k = 5,
cex = 0.5,
palette = "jama",
color_labels_by_k = TRUE,
rect = TRUE)
library(NbClust)
OptNumber <- mydata[c("Fat", "Sugars", "Proteins")] %>%
#scale() %>%
NbClust(distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "ward.D2",
index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 10 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 4 proposed 5 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 2 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
From the analysis of the dendrograms and the indices for the optimal number of clusters, we identified 2 as the best number of clusters.
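As a complementary check on the choice of k (a sketch using factoextra's fviz_nbclust together with hcut, whose defaults of euclidean distance and ward.D2 match the settings used above):
fviz_nbclust(mydata[c("Fat", "Sugars", "Proteins")],
             FUNcluster = hcut, # hierarchical clustering + tree cut
             method = "silhouette", # average silhouette width per k
             k.max = 10) # same range as NbClust above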
# CUTTING THE DENDROGRAM: based on the dendrogram and the indices above, we cut at k = 2.
mydata$ClusterWard <- cutree(WARD,
k = 2)
head(mydata)
## Item Category Calories Fat Sugars Proteins CategoryFactor Dissimilarity ClusterWard
## 1 Whopper® Sandwich Burgers 660 40 11 28 Burgers 50.04998 1
## 2 Whopper® Sandwich with Cheese Burgers 740 46 11 32 Burgers 57.10517 1
## 3 Bacon & Cheese Whopper® Sandwich Burgers 790 51 11 35 Burgers 62.82515 1
## 4 Double Whopper® Sandwich Burgers 900 58 11 48 Burgers 76.08548 1
## 5 Double Whopper® Sandwich with Cheese Burgers 980 64 11 52 Burgers 83.19255 1
## 8 Whopper JR.® Sandwich Burgers 310 18 7 13 Burgers 23.28089 2
library(factoextra)
# Performing K-means clustering; the initial leaders are the centroids of the groups found with hierarchical clustering.
K_MEANS <- hkmeans(mydata[c("Fat", "Sugars", "Proteins")],
k = 2,
hc.metric = "euclidean",
hc.method = "ward.D2")
K_MEANS
## Hierarchical K-means clustering with 2 clusters of sizes 30, 43
##
## Cluster means:
## Fat Sugars Proteins
## 1 45.80000 8.966667 30.833333
## 2 16.05814 4.627907 9.813953
##
## Clustering vector:
## 1 2 3 4 5 8 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
## 1 1 1 1 1 2 1 1 1 1 1 2 2 2 2 1 2 2 1 1 1 1 1 2 2 2 2 2 1 2 2 1 1 2 2 2 2 2 2 2
## 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
## 2 1 2 2 1 1 1 1 1 2 1 2 2 2 1 1 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 8927.933 6587.663
## (between_SS / total_SS = 60.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
## [8] "iter" "ifault" "data" "hclust"
The ratio between the between-group sum of squares and the total sum of squares is 60.5 %. It represents the share of total variability explained by the differences between the groups.
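The same ratio can be recomputed directly from the components of the fitted object listed under "Available components" above:
K_MEANS$betweenss / K_MEANS$totss # between_SS / total_SS, about 0.605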
# VISUALISING (the 73 units are now separated into 2 groups)
fviz_cluster(K_MEANS,
palette = "jama",
repel = FALSE,
ggtheme = theme_classic())
mydata$ClusterK_Means <- K_MEANS$cluster
head(mydata[c("Item", "ClusterWard", "ClusterK_Means")])
## Item ClusterWard ClusterK_Means
## 1 Whopper® Sandwich 1 1
## 2 Whopper® Sandwich with Cheese 1 1
## 3 Bacon & Cheese Whopper® Sandwich 1 1
## 4 Double Whopper® Sandwich 1 1
## 5 Double Whopper® Sandwich with Cheese 1 1
## 8 Whopper JR.® Sandwich 2 2
# Checking for reclassification
table(mydata$ClusterWard, mydata$ClusterK_Means)
##
## 1 2
## 1 29 1
## 2 1 42
From the table above we can see that some reclassification occurred after performing K-means clustering: one unit moved from group 1 to group 2, and one from group 2 to group 1.
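The agreement between the two solutions can also be summarized in a single number with the adjusted Rand index (a sketch assuming the mclust package is available; 1 means perfect agreement, values near 0 mean chance-level agreement):
library(mclust)
adjustedRandIndex(mydata$ClusterWard, mydata$ClusterK_Means)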
Centroids <- K_MEANS$centers
Centroids
## Fat Sugars Proteins
## 1 45.80000 8.966667 30.833333
## 2 16.05814 4.627907 9.813953
Final leaders (centroids) are used to describe the groups. Since the cluster variables were not standardized, the centroids are expressed in grams, so a positive value does not by itself mean "above average"; what matters is the comparison between the groups. Items in group 1 have higher averages of fat, sugars and proteins than items in group 2 on all three cluster variables.
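To read the centroids as above/below-average deviations, they can be rescaled by the overall means and standard deviations of the cluster variables (a base-R sketch; vars and StdCentroids are names introduced here):
vars <- mydata[c("Fat", "Sugars", "Proteins")]
StdCentroids <- sweep(sweep(Centroids, 2, colMeans(vars), "-"),
                      2, apply(vars, 2, sd), "/")
StdCentroids # positive = above the overall mean, in standard-deviation units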
library(ggplot2)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
Figure <- as.data.frame(Centroids)
Figure$id <- 1:nrow(Figure)
Figure <- pivot_longer(Figure, cols = c(Fat, Sugars, Proteins))
Figure$Groups <- factor(Figure$id,
levels = c(1, 2),
labels = c("1", "2"))
Figure$nameFactor <- factor(Figure$name,
levels = c("Fat", "Sugars", "Proteins"),
labels = c("Fat", "Sugars", "Proteins"))
ggplot(Figure, aes(x = nameFactor, y = value)) +
geom_hline(yintercept = 0) +
theme_bw() +
geom_point(aes(shape = Groups, col = Groups), size = 3) +
geom_line(aes(group = id), linewidth = 1) +
ylab("Averages") +
xlab("Cluster variables")
The figure compares the group centroids on the raw gram scales. Menu items in group 1 have higher average grams of fat, sugars and proteins than menu items in group 2 on all three cluster variables, so we can conclude that the items in cluster 1 are the more caloric ones.
# ANOVA to check if all cluster variables are successful at classifying units into groups.
fit <- aov(cbind(Fat, Sugars, Proteins) ~ as.factor(ClusterK_Means),
data = mydata)
summary(fit)
## Response Fat :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 1 15631.6 15631.6 178.15 < 2.2e-16 ***
## Residuals 71 6229.9 87.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Sugars :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 1 332.7 332.66 7.1767 0.009171 **
## Residuals 71 3291.0 46.35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response Proteins :
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(ClusterK_Means) 1 7807.4 7807.4 92.47 1.725e-14 ***
## Residuals 71 5994.7 84.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
H0: The arithmetic means of the groups are equal (group 1 = group 2).
H1: At least one arithmetic mean is different.
We have 3 variables, so 3 separate ANOVA tests. All of the variables used in the clustering are statistically significant (Fat and Proteins at p < 0.001, Sugars at p = 0.009), which tells us that all are successful at classifying units into groups, since we rejected all three H0s. We can also compare the F-values: the highest F-value belongs to Fat, meaning the differences between the clusters are largest for that variable, while Sugars has the lowest F-value and hence the smallest between-cluster differences.
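The F-value comparison can be made more concrete with eta squared, the share of each variable's total sum of squares explained by the clustering, computed by hand from the ANOVA tables above:
15631.6 / (15631.6 + 6229.9) # Fat: about 0.72
332.7 / (332.7 + 3291.0) # Sugars: about 0.09
7807.4 / (7807.4 + 5994.7) # Proteins: about 0.57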
# Pearson Chi-squared test
chisq_results <- chisq.test(mydata$CategoryFactor, as.factor(mydata$ClusterK_Means))
chisq_results
##
## Pearson's Chi-squared test
##
## data: mydata$CategoryFactor and as.factor(mydata$ClusterK_Means)
## X-squared = 1.6254, df = 2, p-value = 0.4437
addmargins(chisq_results$observed)
##
## mydata$CategoryFactor 1 2 Sum
## Burgers 11 11 22
## Chicken 8 10 18
## Breakfast 11 22 33
## Sum 30 43 73
round(chisq_results$expected, 2)
##
## mydata$CategoryFactor 1 2
## Burgers 9.04 12.96
## Chicken 7.40 10.60
## Breakfast 13.56 19.44
round(chisq_results$residuals, 2) # Pearson residuals
##
## mydata$CategoryFactor 1 2
## Burgers 0.65 -0.54
## Chicken 0.22 -0.19
## Breakfast -0.70 0.58
Pearson’s Chi-squared test:
H0: There is no association between categorical variables.
H1: There is some association between categorical variables.
Since the p-value (0.4437) is greater than 5 %, we cannot reject H0: there is no evidence of an association between cluster membership and the menu category (CategoryFactor).
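Which cells deviate most from the expected counts can be judged with the adjusted standardized residuals also stored in the test object; values outside roughly ±1.96 would flag cells driving an association (given p = 0.44, none are expected to):
round(chisq_results$stdres, 2) # adjusted standardized residuals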
Classification of 77 menu items (73 after removing 4 outliers) from Burger King was based on three variables measured in the same units (grams), so no standardization was applied. For hierarchical clustering, Ward's algorithm was used, and based on the analysis of the dendrogram and the indices analyzing the increase in heterogeneity, it was decided to classify the menu items into two groups. The classification was further optimized using K-means clustering.
Group 1 contains 30 menu items, characterized by the higher averages of all three cluster variables: these items have the most grams of fat, sugars and proteins. Group 2 contains 43 menu items with lower averages of all cluster variables. Group 1 contains somewhat more burgers and chicken items and fewer breakfast items than expected, while group 2 contains somewhat more breakfast items and fewer burgers and chicken items than expected; however, the chi-squared test showed that this association is not statistically significant at α = 0.05 (p = 0.4437).