#Required library
library(blob)
library(cluster)
library(corrplot)
## Warning: le package 'corrplot' a é– ï¿½ compil� avec la version R 4.1.3
## corrplot 0.92 loaded
library(datasets)
library(DBI)
library(dbplyr)
library(dendextend)
## Warning: le package 'dendextend' a é– ï¿½ compil� avec la version R 4.1.3
##
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attachement du package : 'dendextend'
## L'objet suivant est masqué depuis 'package:stats':
##
## cutree
library(dplyr)
##
## Attachement du package : 'dplyr'
## Les objets suivants sont masqués depuis 'package:dbplyr':
##
## ident, sql
## Les objets suivants sont masqués depuis 'package:stats':
##
## filter, lag
## Les objets suivants sont masqués depuis 'package:base':
##
## intersect, setdiff, setequal, union
library(factoextra)
## Warning: le package 'factoextra' a é– ï¿½ compil� avec la version R 4.1.3
## Le chargement a nécessité le package : ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(fmsb)
## Warning: le package 'fmsb' a é– ï¿½ compil� avec la version R 4.1.3
library(forcats)
library(formattable)
## Warning: le package 'formattable' a é– ï¿½ compil� avec la version R 4.1.3
library(GGally)
## Warning: le package 'GGally' a é– ï¿½ compil� avec la version R 4.1.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
library(graphics)
library(grDevices)
library(magrittr)
library(methods)
library(patchwork)
## Warning: le package 'patchwork' a é– ï¿½ compil� avec la version R 4.1.3
##
## Attachement du package : 'patchwork'
## L'objet suivant est masqué depuis 'package:formattable':
##
## area
library(plotly)
## Warning: le package 'plotly' a é– ï¿½ compil� avec la version R 4.1.3
##
## Attachement du package : 'plotly'
## L'objet suivant est masqué depuis 'package:formattable':
##
## style
## L'objet suivant est masqué depuis 'package:ggplot2':
##
## last_plot
## L'objet suivant est masqué depuis 'package:stats':
##
## filter
## L'objet suivant est masqué depuis 'package:graphics':
##
## layout
library(plotrix)
library(plyr)
## Warning: le package 'plyr' a é– ï¿½ compil� avec la version R 4.1.3
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attachement du package : 'plyr'
## Les objets suivants sont masqués depuis 'package:plotly':
##
## arrange, mutate, rename, summarise
## Les objets suivants sont masqués depuis 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(purrr)
##
## Attachement du package : 'purrr'
## L'objet suivant est masqué depuis 'package:plyr':
##
## compact
## L'objet suivant est masqué depuis 'package:magrittr':
##
## set_names
library(readr)
library(readxl)
library(reprex)
library(Rsymphony)
library(splitstackshape)
library(stats)
library(stringr)
library(tibble)
library(tidyr)
##
## Attachement du package : 'tidyr'
## L'objet suivant est masqué depuis 'package:magrittr':
##
## extract
library(tidyselect)
library(tidyverse)
library(utils)
#Import the dataset
library(readxl)
mcdonalds_it_20220221 <- read_excel("mcdonalds_it_20220221.xlsx")
data <- mcdonalds_it_20220221
View(data)
head(data)
## # A tibble: 6 x 15
## ID `web-scraper-order` `web-scraper-s~` name product_link `product_link-~`
## <dbl> <chr> <chr> <chr> <lgl> <chr>
## 1 1 1631284741-140 https://www.mcd~ Milk~ NA https://www.mcd~
## 2 2 1631285604-185 https://www.mcd~ Juni~ NA https://www.mcd~
## 3 3 1631284758-141 https://www.mcd~ Cono NA https://www.mcd~
## 4 4 1631284516-126 https://www.mcd~ Capp~ NA https://www.mcd~
## 5 5 1631285062-160 https://www.mcd~ Coca~ NA https://www.mcd~
## 6 6 1631283842-95 https://www.mcd~ Mini~ NA https://www.mcd~
## # ... with 9 more variables: Calories <dbl>, `Total Fats` <dbl>,
## # `Saturated Fats` <dbl>, `Total Carbs` <dbl>, `Total Sugars` <dbl>,
## # `Dietary Fiber` <dbl>, Protein <dbl>, Salt <dbl>, Rating <chr>
#Check dimension of the dataset
dim(data)
## [1] 124 15
#Removing columns we do not need (web scraper, web scraper-start-url, product_link)
data_clean <- data[ -c(2, 3, 5)]
#Removing missing values
data_clean_2 <- na.omit(data_clean)
#Converting characters in Rating variable into double
library(plyr)
data_clean_2$Rating <- revalue(data_clean_2$Rating,c("H"=2, "M"=1, "L"=0))
data_clean_2$Rating <- as.numeric(data_clean_2$Rating)
#Removing units of measure from numeric variables
Calories_1 <- as.numeric(as.character(substr(data_clean_2$Calories,1,4)))
TotalFats_1 <- as.numeric(as.character(substr(data_clean_2$`Total Fats`,1,4)))
SaturatedFats_1 <- as.numeric(as.character(substr(data_clean_2$`Saturated Fats`,1,4)))
TotalCarbs_1 <- as.numeric(as.character(substr(data_clean_2$`Total Carbs`,1,4)))
TotalSugars_1 <- as.numeric(as.character(substr(data_clean_2$`Total Sugars`,1,4)))
DietaryFiber_1 <- as.numeric(as.character(substr(data_clean_2$`Dietary Fiber`,1,4)))
Protein_1 <- as.numeric(as.character(substr(data_clean_2$Protein,1,4)))
Salt_1 <- as.numeric(as.character(substr(data_clean_2$Salt,1,4)))
data_clean_2$Calories <- Calories_1
data_clean_2$`Total Fats`<- TotalFats_1
data_clean_2$`Saturated Fats` <- SaturatedFats_1
data_clean_2$`Total Carbs`<-TotalCarbs_1
data_clean_2$`Total Sugars`<- TotalSugars_1
data_clean_2$`Dietary Fiber`<- DietaryFiber_1
data_clean_2$Protein <- Protein_1
data_clean_2$Salt <- Salt_1
#Synthesising the results of the cleaned dataset
summary(data_clean_2)
## ID name product_link-href Calories
## Min. : 1.00 Length:120 Length:120 Min. : 0.0
## 1st Qu.: 31.75 Class :character Class :character 1st Qu.: 93.5
## Median : 62.50 Mode :character Mode :character Median :234.5
## Mean : 62.12 Mean :241.4
## 3rd Qu.: 92.25 3rd Qu.:328.5
## Max. :123.00 Max. :744.0
## Total Fats Saturated Fats Total Carbs Total Sugars
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 2.70 1st Qu.: 0.675 1st Qu.:12.00 1st Qu.: 4.85
## Median : 8.20 Median : 3.200 Median :26.50 Median : 8.95
## Mean :10.69 Mean : 4.371 Mean :27.48 Mean :14.33
## 3rd Qu.:17.00 3rd Qu.: 6.325 3rd Qu.:42.00 3rd Qu.:22.00
## Max. :46.00 Max. :20.000 Max. :89.00 Max. :57.00
## Dietary Fiber Protein Salt Rating
## Min. :0.000 Min. : 0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.000 1st Qu.: 0.975 1st Qu.:0.0800 1st Qu.:1.000
## Median :1.000 Median : 5.000 Median :0.2900 Median :2.000
## Mean :1.277 Mean : 8.176 Mean :0.7586 Mean :1.483
## 3rd Qu.:2.100 3rd Qu.: 9.100 3rd Qu.:1.2000 3rd Qu.:2.000
## Max. :4.800 Max. :47.000 Max. :4.8000 Max. :2.000
#Check dimension of the cleaned dataset (data_clean_2)
dim(data_clean_2)
## [1] 120 12
#Creating "Category" variable to group all products into macro-areas
data_clean_2$Category = case_when (substring (data_clean_2$`product_link-href`,35,40)=='panini'~'Panini',
substring(data_clean_2$`product_link-href`,35,50)=='gelati-e-dessert'~'Gelati e Dessert',
substring(data_clean_2$`product_link-href`,35,40)=='mccafe'~'McCafe',
substring(data_clean_2$`product_link-href`,35,41)=='bevande'~'Bevande',
substring(data_clean_2$`product_link-href`,35,39)=='salse'~'Salse',
substring(data_clean_2$`product_link-href`,35,42)=='insalate'~'Insalate',
substring(data_clean_2$`product_link-href`,35,42)=='patatine'~'Patatine',
substring(data_clean_2$`product_link-href`,35,46)=='frescallegre'~'Frescallegre',
substring(data_clean_2$`product_link-href`,35,54)=='nuggets-e-sfiziosita'~'Nuggets e Sfiziosità ')
#Creating categorical variable for McCafe
data_clean_2$'Bevande McCafe' = case_when (substring (data_clean_2$`product_link-href`,42,58)=='caffetteria-calda'~'Caffetteria Calda',
substring(data_clean_2$`product_link-href`,42,58)=='succhi-e-spremute'~'Succhi e Spremute')
#Frequency table per Category with the percentages indicating the weight of each in the total
category_freq <- data_clean_2 %>% count('Category')
category_freq$perc <- (category_freq$freq)*100/nrow(data_clean_2)
category_freq
## Category freq perc
## 1 Bevande 9 7.500000
## 2 Frescallegre 3 2.500000
## 3 Gelati e Dessert 20 16.666667
## 4 Insalate 4 3.333333
## 5 McCafe 48 40.000000
## 6 Nuggets e Sfiziosità 4 3.333333
## 7 Panini 23 19.166667
## 8 Patatine 4 3.333333
## 9 Salse 5 4.166667
#Pie chart Category to understand the distribution of all categories
pie(table(data_clean_2$Category))
#Categories with the highest and lowest number of items
paste("The category with the highest number of products is:", category_freq$Category[category_freq$freq == max(category_freq$freq) ], "with:", max(category_freq$freq), "items. The category with the lowest number of product is:", category_freq$Category[category_freq$freq == min(category_freq$freq)], "with:", min(category_freq$freq), "items")
## [1] "The category with the highest number of products is: McCafe with: 48 items. The category with the lowest number of product is: Frescallegre with: 3 items"
#Exploring which are the categories in McCafe
McCafe <- data_clean_2 %>% filter(Category == 'McCafe')
McCafe <- McCafe[,c(2,13)]
McCafe
## # A tibble: 48 x 2
## name Category
## <chr> <chr>
## 1 Cappuccino grande McCafe
## 2 Mini Donut cacao McCafe
## 3 Cremolosa McCafe
## 4 Margherite con confettura McCafe
## 5 Cremolosa al caffè McCafe
## 6 Treccia alla crema McCafe
## 7 Mini Muffin cioccolato bianco McCafe
## 8 Tè caldo e infusi McCafe
## 9 Succo di frutta pera McCafe
## 10 Muffin cioccolato McCafe
## # ... with 38 more rows
#Average calories in the whole menu
library(tidyverse)
mean_cal = round(mean(data_clean_2$Calories),2)
paste("The average calories in the whole menu is:", mean_cal)
## [1] "The average calories in the whole menu is: 241.42"
#Average Calories in Each Category
m_cal=aggregate(data_clean_2$Calories,by=list(type=data_clean_2$Category),mean)
data.frame("Category"=m_cal$type,"Average calories"=m_cal$x)
## Category Average.calories
## 1 Bevande 92.11111
## 2 Frescallegre 29.66667
## 3 Gelati e Dessert 275.60000
## 4 Insalate 160.75000
## 5 McCafe 186.66667
## 6 Nuggets e Sfiziosità 186.50000
## 7 Panini 454.39130
## 8 Patatine 343.75000
## 9 Salse 73.00000
# Saturated Fats Boxplot by Category
ggplot(data_clean_2, aes(x=Category, y= data_clean_2$'Saturated Fats')) +
geom_boxplot(aes(fill=Category), outlier.colour = "red") +
coord_flip() +
labs(title= "Saturated Fats Boxplot by Category") +
xlab("") +
ylab("Saturated Fats")
## Warning: Use of `data_clean_2$"Saturated Fats"` is discouraged. Use `Saturated
## Fats` instead.
#Correlation matrix to determine the strength and the direction of relationship between the variables.
corrplot(cor(data_clean_2 %>% select_if(is.numeric), method = "pearson"), method = "circle")
#In this matrix we can also understand what the Rating (0,1,2) is due to: it is strongly negatively correlated with Saturated Fats, very negatively correlated with Calories, Total Fats and Total Carbs, but poorly correlated with Dietary Fiber, Protein and Salt
#Let's get the total calories in the whole menu then see how much do "Beverages" and "Coffee" and "Spremute" contribute
menu_tot_cal = sum(data_clean_2$Calories)
paste("The total calories in the whole menu is:", menu_tot_cal)
## [1] "The total calories in the whole menu is: 28970"
#Now let's see by how much "Beverage" contribute to the overall caloric intake
Bevande_total_cal = data_clean_2 %>%
filter(Category == "Bevande") %>%
select(Calories) %>%
slice(1:20) %>%
sum()
Bevande_percent = round((Bevande_total_cal / menu_tot_cal * 100),2)
paste("Bevarages contribute", Bevande_percent, "% to the total calories")
## [1] "Bevarages contribute 2.86 % to the total calories"
#Now let's see by how much "Coffee" contribute to the overall caloric intake
Coffee_total_cal = data_clean_2 %>%
filter(data_clean_2$'Bevande McCafe' == "Caffetteria Calda") %>%
select("Calories") %>%
slice(9:n()) %>%
sum()
Coffee_percent = round((Coffee_total_cal / menu_tot_cal * 100),2)
paste("Coffee contributes", Coffee_percent, "% to the total calories")
## [1] "Coffee contributes 2.14 % to the total calories"
#Now let's see by how much "Spremute" contribute to the overall caloric intake
succhi_e_spremute_total_cal = data_clean_2%>%
filter(data_clean_2$`Bevande McCafe` == "Succhi e Spremute") %>%
select(Calories) %>%
slice(1:20) %>%
sum()
Spremute_percent = round((succhi_e_spremute_total_cal / menu_tot_cal * 100),2)
paste("Spremute contributes", Spremute_percent, "% to the total calories")
## [1] "Spremute contributes 1.58 % to the total calories"
#Create 2 DataFrame: one for grilled chicken products and one for crispy chicken products
Grilled <- data_clean_2 %>% filter(name == "McWrap® con petto di pollo alla piastra e Parmigiano Reggiano" | name == "Insalata con petto di pollo alla piastra e Parmigiano Reggiano" | name == "Junior Chicken" | name == "Chicken Country")
df_Grilled <- Grilled[,c(2,4,12)]
df_Grilled
## # A tibble: 4 x 3
## name Calories Rating
## <chr> <dbl> <dbl>
## 1 Junior Chicken 206 2
## 2 Insalata con petto di pollo alla piastra e Parmigiano Reggiano 201 2
## 3 McWrap® con petto di pollo alla piastra e Parmigiano Reggiano 426 2
## 4 Chicken Country 447 2
Crispy <- data_clean_2 %>% filter( name == "Chickenburger" | name == "McChicken® Originale" |name == "Double Chicken BBQ" | name == "My Selection Chicken Pepper" | ID == 22 | name == "Insalata con petto di pollo croccante e Parmigiano Reggiano")
df_Crispy <- Crispy[,c(2,4,12)]
df_Crispy
## # A tibble: 6 x 3
## name Calories Rating
## <chr> <dbl> <dbl>
## 1 "McWrap® con petto di pollo croccante \r\ne Parmigiano Reggia~ 535 1
## 2 "Insalata con petto di pollo croccante e Parmigiano Reggiano" 313 2
## 3 "Chickenburger" 323 2
## 4 "McChicken® Originale" 446 2
## 5 "Double Chicken BBQ" 558 1
## 6 "My Selection Chicken Pepper" 636 1
#How does nutritional value change?
summary(df_Grilled)
## name Calories Rating
## Length:4 Min. :201.0 Min. :2
## Class :character 1st Qu.:204.8 1st Qu.:2
## Mode :character Median :316.0 Median :2
## Mean :320.0 Mean :2
## 3rd Qu.:431.2 3rd Qu.:2
## Max. :447.0 Max. :2
summary(df_Crispy)
## name Calories Rating
## Length:6 Min. :313.0 Min. :1.0
## Class :character 1st Qu.:353.8 1st Qu.:1.0
## Mode :character Median :490.5 Median :1.5
## Mean :468.5 Mean :1.5
## 3rd Qu.:552.2 3rd Qu.:2.0
## Max. :636.0 Max. :2.0
#Average calories are higher for products with Crispy Chicken (468,5 vs 320) and the mean of Rating is lower (1,5 vs 2)
#Comparison between Salad and McWrap (Grilled and Crispy)
#These are the only products we have in both Grilled and Crispy versions, so we can compare them directly
SW_Grilled = Grilled %>% filter(name == "Insalata con petto di pollo alla piastra e Parmigiano Reggiano" | name == "McWrap® con petto di pollo alla piastra e Parmigiano Reggiano")
SW_Crispy = Crispy %>% filter(name == "Insalata con petto di pollo croccante e Parmigiano Reggiano" | ID == 22)
SW_Grilled
## # A tibble: 2 x 14
## ID name `product_link-~` Calories `Total Fats` `Saturated Fats`
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 31 Insalata con pe~ https://www.mcd~ 201 8.2 4.6
## 2 77 McWrap® con pet~ https://www.mcd~ 426 17 6
## # ... with 8 more variables: `Total Carbs` <dbl>, `Total Sugars` <dbl>,
## # `Dietary Fiber` <dbl>, Protein <dbl>, Salt <dbl>, Rating <dbl>,
## # Category <chr>, `Bevande McCafe` <chr>
SW_Crispy
## # A tibble: 2 x 14
## ID name `product_link-~` Calories `Total Fats` `Saturated Fats`
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 22 "McWrap® con pe~ https://www.mcd~ 535 25 6.7
## 2 25 "Insalata con p~ https://www.mcd~ 313 17 5.3
## # ... with 8 more variables: `Total Carbs` <dbl>, `Total Sugars` <dbl>,
## # `Dietary Fiber` <dbl>, Protein <dbl>, Salt <dbl>, Rating <dbl>,
## # Category <chr>, `Bevande McCafe` <chr>
#Merge the two dataframe
SW_merged <- rbind(SW_Grilled,SW_Crispy)
SW_merged <- SW_merged[,c(2,4)]
SW_merged
## # A tibble: 4 x 2
## name Calories
## <chr> <dbl>
## 1 "Insalata con petto di pollo alla piastra e Parmigiano Reggiano" 201
## 2 "McWrap® con petto di pollo alla piastra e Parmigiano Reggiano" 426
## 3 "McWrap® con petto di pollo croccante \r\ne Parmigiano Reggiano" 535
## 4 "Insalata con petto di pollo croccante e Parmigiano Reggiano" 313
#Bar Plot with grilled and cripsy chicken compared
SW_merged %>%
ggplot(aes(x= name, y= `Calories`)) +
geom_bar(stat="identity", alpha = .5) +
coord_flip() +
labs(title = "Calories in Grilled and Crispy products", fill = "Category") +
xlab("") +
ylab("Calories")
#As we can see in the bar plot, both products contain more calories in the Crispy version
#With regard to the data available to us, we have chosen to focus on Saturated Fat and Salt, in order to construct a new Rating that indicates the product’s risk for cholesterol
Colest <- subset(data_clean_2, select=c(name, `Saturated Fats`, Salt))
# Standardize quantitative variables
Colest$SaturatedFats_stz = scale(Colest$`Saturated Fats`)
Colest$Salt_stz = scale(Colest$Salt)
# Check if means are equal to 0
summarise(Colest,
avg_SaturatedFats_stz = mean(SaturatedFats_stz),
avg_Salt_stz = mean(Salt_stz))
## avg_SaturatedFats_stz avg_Salt_stz
## 1 -4.919296e-17 -4.278352e-17
# Check if standard deviations are equal to 1
summarise(Colest,
sd_SaturatedFats_stz = sd(SaturatedFats_stz),
sd_Salt_stz = sd(Salt_stz))
## sd_SaturatedFats_stz sd_Salt_stz
## 1 1 1
# Choose the number of cluster based on Elbow Methods for k-means Algorithm:
Colest %>%
select(SaturatedFats_stz, Salt_stz) %>%
fviz_nbclust(FUN = kmeans, method = 'wss')
#The number of clusters is 3
# Define a seed:
# Setting random seed allows to initialize a pseudorandom number generator reproducing the same output
set.seed(3)
# Select Standardized columns
Colest_stz = Colest %>%
select(SaturatedFats_stz, Salt_stz)
# Performing Cluster Analysis with Kmeans algorithm ("iter.max" used to reduce the computational time) #Evaluate 20 different starting points and choose the best one
clusterK = kmeans(Colest_stz, centers = 3, nstart=20, iter.max=100)
# Plot k-means
clusterK %>%
fviz_cluster(geom = 'point', data = Colest_stz) +
ggtitle("k-means clustering: k = 3") +
theme(legend.position = 'bottom',
legend.title = element_blank())
# Print Within Deviance
print(paste("Within Deviance:", round(sum(clusterK$withinss), 1)))
## [1] "Within Deviance: 74.5"
# Plot Silhouette Coefficient
plot(silhouette(clusterK$cluster, dist(Colest_stz)))
Colest$Rating <- case_when(
clusterK$cluster =='1' ~ "Low Risk",
clusterK$cluster=='2' ~ "High Risk",
clusterK$cluster=='3' ~ "Medium Risk")
View(Colest)
#Now in the Colest dataframe is possible to see if each product belongs to low, medium or high risk for cholesterol
#Divide the dataset into training set and test set. The percentage of sizes of training and test is 70% to 30%.
set.seed(1234)
index=sample(2,nrow(data_clean_2),replace=TRUE,prob=c(0.7,0.3))
train = data_clean_2[index==1,c(2,4:12)]
test = data_clean_2[index==2,c(2,4:12)]
head(train)
## # A tibble: 6 x 10
## name Calories `Total Fats` `Saturated Fats` `Total Carbs` `Total Sugars`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Milkshake~ 268 4.9 3.3 49 45
## 2 Junior Ch~ 206 2.8 0.6 30 6.6
## 3 Cono 140 3.6 2.1 24 17
## 4 Cappuccin~ 143 9 6 8.8 8.8
## 5 Mini Donu~ 80 5 2.8 7.7 2.4
## 6 Gluten Fr~ 457 25 13 33 4.2
## # ... with 4 more variables: `Dietary Fiber` <dbl>, Protein <dbl>, Salt <dbl>,
## # Rating <dbl>
head(test)
## # A tibble: 6 x 10
## name Calories `Total Fats` `Saturated Fats` `Total Carbs` `Total Sugars`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Coca Cola® 168 0 0 42 42
## 2 Ketchup 33 0.1 0 7.4 6
## 3 Sundae Fr~ 273 3.9 2.7 55 50
## 4 Tè caldo ~ 0 0 0 0 0
## 5 McFlurry®~ 453 17 8.2 66 54
## 6 McCrunchy~ 231 6.6 2.2 35 12
## # ... with 4 more variables: `Dietary Fiber` <dbl>, Protein <dbl>, Salt <dbl>,
## # Rating <dbl>
#Standardization of the training and test set
stdtrain = as.data.frame(scale(train[,-c(1,10)]))
stdtest = as.data.frame(scale(test[,-c(1,10)]))
summary(stdtrain[,1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.39299 -0.84942 -0.07455 0.00000 0.50371 2.90928
mean(stdtrain[,1])
## [1] -6.307026e-17
sd(stdtrain[,1])
## [1] 1
#Results of the KNN model
library(class)
knn = knn(stdtrain, stdtest, train$Rating, k=3)
test$predictions = knn
test$misclassified = test$Rating != test$predictions
head(test)
## # A tibble: 6 x 12
## name Calories `Total Fats` `Saturated Fats` `Total Carbs` `Total Sugars`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Coca Cola® 168 0 0 42 42
## 2 Ketchup 33 0.1 0 7.4 6
## 3 Sundae Fr~ 273 3.9 2.7 55 50
## 4 Tè caldo ~ 0 0 0 0 0
## 5 McFlurry®~ 453 17 8.2 66 54
## 6 McCrunchy~ 231 6.6 2.2 35 12
## # ... with 6 more variables: `Dietary Fiber` <dbl>, Protein <dbl>, Salt <dbl>,
## # Rating <dbl>, predictions <fct>, misclassified <lgl>
#Confusion matrix
t = table(knn,test$Rating)
t
##
## knn 0 1 2
## 0 1 0 0
## 1 0 9 2
## 2 0 1 14
#Calculate accuracy
nrow(test)
## [1] 27
accuracy = sum(diag(t))/nrow(test)
error_rate = 1 - accuracy
cat("Accuracy = ", accuracy, ", error rate = ", error_rate)
## Accuracy = 0.8888889 , error rate = 0.1111111
It turns out that our KNN model is quite accurate! So we can go on to see which the best k is.
knn_i = NULL
error_rate = NULL
for(i in 1:30)
{
knn_i = knn(stdtrain, stdtest, train$Rating, k=i)
error_rate[i] = mean(knn_i != test$Rating)
}
rm(knn_i)
best_k= which.min(error_rate)
cat("Best k =", best_k)
## Best k = 3
We’re already using the best k! So we don’t need to redo the KNN.
Now let’s use this model to do some interesting things. There are some food that people believe are healthy, but actually are not that healthy. We found several food like this on the website of Carrefour, and we wonder how these food would be classified if it were a McDonald’s food. Then we can see whether they’re healthy or not.
#Input data of the "healthy" good
healthy_test <- data.frame(
"name"=c("Smoothie innocent 75cl","Smoothie innocent no sugar added 75cl","Smoothie innocent 33cl","Cereal & fruit biscuit bio nutri+ 218g","Cereal & fruit biscuit bio nutri+ 55g","Yogurt Carrefour 125g","Yogurt Carrefour 250g","Salad Roma Sodebo (with oil sauce and a cookie) 320g","Penne Chicken salad with Caesar sauce 250g","Salad piémontaise bonduelle 320g"),
"Calories"=c(410.3,397.5,138,844,211,110,220,682,440,419),
"Total Fats"=c(0,0.8,0,15.3,3.8,3.6,7.2,35.2,25,30.4),
"Saturated Fats"=c(0,0.8,0,2.4,0.6,2.4,4.8,4.8,5.5,3.2),
"Total Carbs"=c(90,89.3,37.3,154.8,38.7,15,30,64,35,20.8),
"Total Sugar"=c(75,81.8,33,85,21.3,15,30,16.6,33,3.2),
"Dietary Fiber"=c(4.5,9.8,0,13.1,3.3,6,1.2,8.6,4.3,4.8),
"Protein"=c(1.5,3.8,0,15.5,3.9,4.1,8.2,23,16.5,12.8),
"Salt"=c(0,0,0,1.5,0.4,0.1,0.2,2.1,3.3,2.9))
healthy_test
## name Calories Total.Fats
## 1 Smoothie innocent 75cl 410.3 0.0
## 2 Smoothie innocent no sugar added 75cl 397.5 0.8
## 3 Smoothie innocent 33cl 138.0 0.0
## 4 Cereal & fruit biscuit bio nutri+ 218g 844.0 15.3
## 5 Cereal & fruit biscuit bio nutri+ 55g 211.0 3.8
## 6 Yogurt Carrefour 125g 110.0 3.6
## 7 Yogurt Carrefour 250g 220.0 7.2
## 8 Salad Roma Sodebo (with oil sauce and a cookie) 320g 682.0 35.2
## 9 Penne Chicken salad with Caesar sauce 250g 440.0 25.0
## 10 Salad piémontaise bonduelle 320g 419.0 30.4
## Saturated.Fats Total.Carbs Total.Sugar Dietary.Fiber Protein Salt
## 1 0.0 90.0 75.0 4.5 1.5 0.0
## 2 0.8 89.3 81.8 9.8 3.8 0.0
## 3 0.0 37.3 33.0 0.0 0.0 0.0
## 4 2.4 154.8 85.0 13.1 15.5 1.5
## 5 0.6 38.7 21.3 3.3 3.9 0.4
## 6 2.4 15.0 15.0 6.0 4.1 0.1
## 7 4.8 30.0 30.0 1.2 8.2 0.2
## 8 4.8 64.0 16.6 8.6 23.0 2.1
## 9 5.5 35.0 33.0 4.3 16.5 3.3
## 10 3.2 20.8 3.2 4.8 12.8 2.9
As we can see above, we mainly focus on 4 types of perceived healthy food: smoothie, cereal biscuit, fruit yogurt and salad.
#Standardization
stdhealthytest = as.data.frame(scale(healthy_test[,-c(1,10)]))
stdmcdonalds = as.data.frame(scale(data_clean_2[,-c(1,2,3,12,13,14,15)]))
summary(stdhealthytest[,1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.17839 -0.73944 0.07108 0.00000 0.20224 1.94210
summary(stdmcdonalds[,1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.41824 -0.86896 -0.04063 0.00000 0.51158 2.95250
Now let’s use our KNN model to find out whether these food are really healthy or not (using the standard of McDonald’s.)
#KNN result
knn1 = knn(stdmcdonalds, stdhealthytest, data_clean_2$Rating, k=3)
healthy_test$predictions = knn1
healthy_test[,c(1,10)]
## name predictions
## 1 Smoothie innocent 75cl 2
## 2 Smoothie innocent no sugar added 75cl 2
## 3 Smoothie innocent 33cl 2
## 4 Cereal & fruit biscuit bio nutri+ 218g 1
## 5 Cereal & fruit biscuit bio nutri+ 55g 2
## 6 Yogurt Carrefour 125g 2
## 7 Yogurt Carrefour 250g 1
## 8 Salad Roma Sodebo (with oil sauce and a cookie) 320g 1
## 9 Penne Chicken salad with Caesar sauce 250g 1
## 10 Salad piémontaise bonduelle 320g 2
Now we can see some of them are not rated highly, even by McDonald’s standard (which is not high!). The results are not the same every time, and sometimes the KNN model even classifies Smoothie 75cl as 1 (medium). So, when drinking sugar added smoothie and fruit yoghurt, or eating cereal biscuit (which claims to be bio and nutri+) and salad with sauce and biscuit, be careful! Don’t eat too much because they’re not as healthy as you think!
#How to create a menu including only healthy food?
#menu = panino + bevanda + salsa + patatine + nuggets e sfiziosità + gelati e dessert
#healthy = rating 2
healthy_panino <- filter(data_clean_2, Category == 'Panini' & Rating == '2')
healthy_bevanda <- filter(data_clean_2, Category == 'Bevande' & Rating == '2')
healthy_salsa <- filter(data_clean_2, Category == 'Salse' & Rating == '2')
healthy_Nuggets_e_Sfiziosità <- filter(data_clean_2, Category == 'Nuggets e Sfiziosità ' & Rating == '2')
healthy_patatine <- filter(data_clean_2, Category == 'Patatine' & Rating == '2')
healthy_gelati_e_dessert <- filter(data_clean_2, Category == 'Gelati e Dessert' & Rating == '2')
panino_HM <- sample(healthy_panino$name, 1, replace = FALSE, prob = NULL)
bevanda_HM <- sample(healthy_bevanda$name, 1, replace = FALSE, prob = NULL)
salsa_HM <- sample(healthy_salsa$name, 1, replace = FALSE, prob = NULL)
bevanda_HM <- sample(healthy_bevanda$name, 1, replace = FALSE, prob = NULL)
nuggets_e_sfiziosità _HM <- sample(healthy_Nuggets_e_Sfiziosità $name, 1, replace = FALSE, prob = NULL)
patatine_HM <- sample(healthy_patatine$name, 1, replace = FALSE, prob = NULL)
gelati_e_dessert_HM <- sample(healthy_gelati_e_dessert$name, 1, replace = FALSE, prob = NULL)
healthy_menu <- data.frame(panino_HM, bevanda_HM, salsa_HM, nuggets_e_sfiziosità _HM, patatine_HM, gelati_e_dessert_HM)
View(healthy_menu)
#This chunk of code rendomly creates a McDonal's menu containing a burger, a drink, a sauce, chips, nuggets and snacks, and ice cream and desserts, each with a rating of 2 (High)
#Which factors affects the nutritional values?
#We have decided to run a regression to identify which among the factors affects the variable calories (that we have takesn as nutritional value)
library(ISLR2)
numeric_data <- data_clean_2[ -c(1, 2,3, 12, 13, 14, 15)]
standardized_numeric_data <- numeric_data
# Standardize quantitative variables
standardized_numeric_data$Calories_std = scale(numeric_data$Calories)
standardized_numeric_data$Total_Fats_std = scale(numeric_data$`Total Fats`)
standardized_numeric_data$Saturated_Fats_std = scale(numeric_data$`Saturated Fats`)
standardized_numeric_data$Total_Carbs_std = scale(numeric_data$`Total Carbs`)
standardized_numeric_data$Total_Sugars_std = scale(numeric_data$`Total Sugars`)
standardized_numeric_data$Dietary_Fiber_std = scale(numeric_data$`Dietary Fiber`)
standardized_numeric_data$Protein_std = scale(numeric_data$`Protein`)
standardized_numeric_data$Salt_std = scale(numeric_data$`Salt`)
# Check if means are equal to 0
summarise(standardized_numeric_data
, avg_Calories_std = mean(Calories_std)
, avg_Total_Fats_std = mean(Total_Fats_std)
, avg_Saturated_Fats_std = mean(Saturated_Fats_std)
, avg_Total_Carbs_std = mean(Total_Carbs_std)
, avg_Total_Sugars_std = mean(Total_Sugars_std)
, avg_Dietary_Fiber_std = mean(Dietary_Fiber_std)
, avg_Protein_std = mean(Protein_std)
, avg_Salt_std = mean(Salt_std))
## avg_Calories_std avg_Total_Fats_std avg_Saturated_Fats_std
## 1 6.361121e-17 6.938894e-17 -4.919296e-17
## avg_Total_Carbs_std avg_Total_Sugars_std avg_Dietary_Fiber_std
## 1 2.810659e-17 1.516076e-17 4.44035e-17
## avg_Protein_std avg_Salt_std
## 1 1.181893e-17 -4.278352e-17
# Check if standar deviations are equal to 1
summarise(standardized_numeric_data
, sd_Calories_std = sd(Calories_std)
, sd_Total_Fats_std = sd(Total_Fats_std)
, sd_Saturated_Fats_std = sd(Saturated_Fats_std)
, sd_Total_Carbs_std = sd(Total_Carbs_std)
, sd_Total_Sugars_std = sd(Total_Sugars_std)
, sd_Dietary_Fiber_std = sd(Dietary_Fiber_std)
, sd_Protein_std = sd(Protein_std)
, sd_Salt_std = sd(Salt_std))
## sd_Calories_std sd_Total_Fats_std sd_Saturated_Fats_std sd_Total_Carbs_std
## 1 1 1 1 1
## sd_Total_Sugars_std sd_Dietary_Fiber_std sd_Protein_std sd_Salt_std
## 1 1 1 1 1
# Remove unstandardized values
standardized_numeric_data <- standardized_numeric_data[ -c(1, 2, 3, 4, 5, 6, 7, 8)]
# 9.(a) Produce a scatterplot matrix which includes all of the variables in the data set.
pairs(standardized_numeric_data, lower.panel = NULL)
#produce a boxplot for each variable and identify outliers
boxplot(standardized_numeric_data$Calories_std,
ylab = "Calories",
main = "Boxplot with Calories",
)
boxplot(standardized_numeric_data$Total_Fats_std,
ylab = "Total Fats",
main = "Boxplot with Total Fats",
)
boxplot(standardized_numeric_data$Saturated_Fats_std,
ylab = "Saturated Fats",
main = "Boxplot with Saturated Fats",
)
boxplot(standardized_numeric_data$Total_Carbs_std,
ylab = "Total Carbs",
main = "Boxplot with Total Carbs",
)
boxplot(standardized_numeric_data$Total_Sugars_std,
ylab = "Total Sugars",
main = "Boxplot with Total Sugars",
)
boxplot(standardized_numeric_data$Dietary_Fiber_std,
ylab = "Dietary Fiber",
main = "Boxplot with Dietary Fiber",
)
boxplot(standardized_numeric_data$Protein_std,
ylab = "Protein",
main = "Boxplot with Protein",
)
boxplot(standardized_numeric_data$Salt_std,
ylab = "Salt",
main = "Boxplot with Salt",
)
library(dplyr)
#removing outliers for Calories
cal_out <- boxplot.stats(standardized_numeric_data$Calories_std)$out
cal_out_ind <- which(standardized_numeric_data$Calories_std %in% c(cal_out))
cal_out_ind
## [1] 15 76 109
#three outliers have been identified in rows 15, 76 and 109
standardized_numeric_data <- standardized_numeric_data[-c(15, 76, 109),]
#the three rows have been removed from the standardized_numeric_data df
tot_fats_out <- boxplot.stats(standardized_numeric_data$Total_Fats_std)$out
tot_fats_out_ind <- which(standardized_numeric_data$Total_Fats_std %in% c(tot_fats_out))
tot_fats_out_ind
## integer(0)
#total fats seems to present no outliers
sat_fats_out <- boxplot.stats(standardized_numeric_data$Saturated_Fats_std)$out
sat_fats_out_ind <- which(standardized_numeric_data$Saturated_Fats_std %in% c(sat_fats_out))
sat_fats_out_ind
## integer(0)
#saturated fats seems to present no outliers
tot_carbs_out <- boxplot.stats(standardized_numeric_data$Total_Carbs_std)$tot_carbs_out
tot_carbs_out_ind <- which(standardized_numeric_data$Total_Carbs_std %in% c(tot_carbs_out))
tot_carbs_out_ind
## integer(0)
#total carbs seems to present no outliers
tot_sugars_out <- boxplot.stats(standardized_numeric_data$Total_Sugars_std)$tot_sugars_out
tot_sugars_out_ind <- which(standardized_numeric_data$Total_Sugars_std %in% c(tot_sugars_out))
tot_sugars_out_ind
## integer(0)
#total sugars seems to present no outliers
diet_fiber_out <- boxplot.stats(standardized_numeric_data$Dietary_Fiber_std)$diet_fiber_out
diet_fiber_out_ind <- which(standardized_numeric_data$Dietary_Fiber_std %in% c(diet_fiber_out))
diet_fiber_out_ind
## integer(0)
#dietary fiber seems to present no outliers
diet_fiber_out <- boxplot.stats(standardized_numeric_data$Dietary_Fiber_std)$diet_fiber_out
diet_fiber_out_ind <- which(standardized_numeric_data$Dietary_Fiber_std %in% c(diet_fiber_out))
diet_fiber_out_ind
## integer(0)
#dietary fiber seems to present no outliers
protein_out <- boxplot.stats(standardized_numeric_data$Protein_std)$protein_out
protein_out_ind <- which(standardized_numeric_data$Protein_std %in% c(protein_out))
protein_out_ind
## integer(0)
#protein seems to present no outliers
salt_out <- boxplot.stats(standardized_numeric_data$Salt_std)$salt_out
salt_out_ind <- which(standardized_numeric_data$Protein_std %in% c(protein_out))
protein_out_ind
## integer(0)
#salt seems to present no outliers
df1_cor = cor(standardized_numeric_data) #calculate correlation between all the variables
corrplot(df1_cor, type='upper', diag = FALSE)
rm(df1_cor)
#there seems to be high positive correlation between total and saturated fats, between protein and salt. For what concerns the DV Calories, it seems to be positively correlated with all the IV (low correlation with Total Sugars)
set.seed(3)
lm.fit = lm(Calories_std~Total_Fats_std+Saturated_Fats_std+Total_Carbs_std+Total_Sugars_std+Dietary_Fiber_std+Protein_std+Salt_std, data=standardized_numeric_data) #linear regression model including all the IVs
summary(lm.fit)
##
## Call:
## lm(formula = Calories_std ~ Total_Fats_std + Saturated_Fats_std +
## Total_Carbs_std + Total_Sugars_std + Dietary_Fiber_std +
## Protein_std + Salt_std, data = standardized_numeric_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.063565 -0.010458 -0.001331 0.009966 0.095466
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0007969 0.0017432 0.457 0.648463
## Total_Fats_std 0.5320369 0.0051273 103.766 < 2e-16 ***
## Saturated_Fats_std -0.0018396 0.0040470 -0.455 0.650320
## Total_Carbs_std 0.4183789 0.0040763 102.636 < 2e-16 ***
## Total_Sugars_std -0.0050930 0.0038386 -1.327 0.187348
## Dietary_Fiber_std 0.0105058 0.0026311 3.993 0.000119 ***
## Protein_std 0.2289911 0.0050844 45.038 < 2e-16 ***
## Salt_std 0.0001195 0.0046882 0.025 0.979708
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01864 on 109 degrees of freedom
## Multiple R-squared: 0.9996, Adjusted R-squared: 0.9996
## F-statistic: 3.821e+04 on 7 and 109 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lm.fit)
#checking for multicollinearity
car::vif(lm.fit)
## Total_Fats_std Saturated_Fats_std Total_Carbs_std Total_Sugars_std
## 6.463850 3.979611 5.581175 5.015693
## Dietary_Fiber_std Protein_std Salt_std
## 2.229444 5.895822 6.494178
#building a model without Total_Fats_std (high VIF, highly correlated with Saturated fats) and Salt_std (high VIF, highly correlated with Protein)
lm.fit2 = lm(Calories_std~Saturated_Fats_std+Total_Carbs_std+Total_Sugars_std+Dietary_Fiber_std+Protein_std, data=standardized_numeric_data) #linear regression model
summary(lm.fit2)
##
## Call:
## lm(formula = Calories_std ~ Saturated_Fats_std + Total_Carbs_std +
## Total_Sugars_std + Dietary_Fiber_std + Protein_std, data = standardized_numeric_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44523 -0.08157 -0.02424 0.07466 0.81765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0004543 0.0172279 -0.026 0.979011
## Saturated_Fats_std 0.3304249 0.0240379 13.746 < 2e-16 ***
## Total_Carbs_std 0.5348011 0.0358468 14.919 < 2e-16 ***
## Total_Sugars_std -0.1180580 0.0326605 -3.615 0.000454 ***
## Dietary_Fiber_std 0.0849339 0.0249466 3.405 0.000923 ***
## Protein_std 0.3215030 0.0298336 10.777 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1846 on 111 degrees of freedom
## Multiple R-squared: 0.9593, Adjusted R-squared: 0.9575
## F-statistic: 523.8 on 5 and 111 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lm.fit2)
#checking for multicollinearity
car::vif(lm.fit2)
## Saturated_Fats_std Total_Carbs_std Total_Sugars_std Dietary_Fiber_std
## 1.432278 4.402877 3.704184 2.044527
## Protein_std
## 2.070755
#no multicollinearity detected, therefore the second model is selected
#the F statistics is high (523,8), therefore the null hypothesis of homogeneity of variance between the predictors is rejected (also the p-value is statistically significant alpha≈0)
#residuals are roughly centered around zero (1Q=-0.082, median=-0.02, 3Q=0,074), therefore the model fits the assumption of heteroscedasticity
#the predictors are able to explain 96% of the variance in Calories (R-square=0.96)
#Saturated fats is significantly (p-value≈0) and positively correlated with Calories
#Total carbs is significantly (p-value≈0) and positively correlated with Calories
#Total sugars is significantly (p-value=0.0005) and negatively correlated with Calories
#Dietary fiber is significantly, although slightly less than other predictors (p-value=0,001), and positively correlated with Calories
#Protein is significantly (p-value≈0) and positively correlated with Calories
#Principal Component Analysis
# The objectives of this analysis are to:
# 1. Identify groups of similar products in McDonald's menu
# 2. determine which variables make one group different from another
library(ggplot2)
library(FactoMineR)
## Warning: le package 'FactoMineR' a é– ï¿½ compil� avec la version R 4.1.3
library(factoextra)
library(corrplot)
library(PerformanceAnalytics)
## Warning: le package 'PerformanceAnalytics' a é– ï¿½ compil� avec la version R
## 4.1.3
## Le chargement a nécessité le package : xts
## Warning: le package 'xts' a é– ï¿½ compil� avec la version R 4.1.3
## Le chargement a nécessité le package : zoo
## Warning: le package 'zoo' a é– ï¿½ compil� avec la version R 4.1.3
##
## Attachement du package : 'zoo'
## Les objets suivants sont masqués depuis 'package:base':
##
## as.Date, as.Date.numeric
##
## Attachement du package : 'xts'
## Les objets suivants sont masqués depuis 'package:dplyr':
##
## first, last
##
## Attachement du package : 'PerformanceAnalytics'
## L'objet suivant est masqué depuis 'package:graphics':
##
## legend
library(readxl)
library(dplyr)
# plot distributions and correlations for each pair of variables
chart.Correlation(numeric_data, histogram=TRUE, pch=19)
#as observed in the regression analysis, there is a strong positive correlation between Calories and Total Fats, Calories and saturated fats, total fats and saturated fats, and salt and protein. Negative correlations only involve total sugars (with dietary fiber, protein, and salt), neverthelss is low (highest negative correlation is between salt and sugar, with a correlation coeff. of -0.28)
#Calculating the principal components (data is standardized and normalized)
results.pca = prcomp(numeric_data, scale=TRUE)
#scree plot
fviz_eig(results.pca, addlabels = TRUE, ylim = c(0, 70))
# Print cumulative explained variance
round(get_eigenvalue(results.pca), 4)#[0:5,]
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 4.8390 60.4873 60.4873
## Dim.2 1.6724 20.9052 81.3925
## Dim.3 0.6647 8.3088 89.7013
## Dim.4 0.5340 6.6755 96.3768
## Dim.5 0.1374 1.7169 98.0938
## Dim.6 0.0866 1.0822 99.1760
## Dim.7 0.0657 0.8211 99.9971
## Dim.8 0.0002 0.0029 100.0000
# Evaluate eigenvalues
results.pca$sdev^2
## [1] 4.838982168 1.672416522 0.664706280 0.534040545 0.137354609 0.086577999
## [7] 0.065690693 0.000231184
# Looking at the eigenvalues, the first two dimensions contain 81.4% of the total variance. This percentage is acceptable to keep the first two main axes for the rest of our analysis.
##### Interpretation Phase ###########################
# Change of the cartesian system (eigenvectors in R point in the negative direction by default)
results.pca$rotation=-results.pca$rotation
results.pca$x=-results.pca$x
# Graph of variables (cos2 = the quality of the variables on the factor map, calculated as the squared coordinates)
fviz_pca_var(results.pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
#The factor map shows that the vertical axis almost coincides with the amount of sugar contained in the product (products high in sugar are low in the graph). Total Fats almost lies on the horizontal axis, although Calories, a little bit lower than Total Fats with regards to the horizontal axis, have a higher quality. Since the regression has shown that Total Fats are highly correlated with Calories, we can define this axis as Nutritional Value.
var = get_pca_var(results.pca)
corrplot(var$cos2, is.corr=FALSE)
#the same results as the factor map are displayed
# Interpretation of the Principal Components: loadings
results.pca$rotation[, 1:2]
## PC1 PC2
## Calories 0.44466763 -0.13311501
## Total Fats 0.42254669 0.04568254
## Saturated Fats 0.36245948 -0.10458761
## Total Carbs 0.30316436 -0.49947828
## Total Sugars 0.01836078 -0.75201108
## Dietary Fiber 0.32881795 0.17648702
## Protein 0.39921587 0.20119659
## Salt 0.36415357 0.28745082
#the first principal component is positively correlated with each variable except for Total Sugars, with which the correlation, although very low, is negative. The second principal component is highly & negatively correlated with Total Sugars. The correlation with Calories is significant while weaker than that with Total Sugars (still negative though).
# For a clear visualization of individuals and avoidance of overlapping text, only the thirty main contributors to the main axes have been represented
fviz_pca_ind (results.pca, axes = c(1, 2), geom = c("point", "text"), select.ind = list(contrib = 30), repel = TRUE, habillage=data_clean_2$Rating)
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#the vertical axis opposes products that are sweet (at the bottom we can find McFlurries, Sundaes and Fanta) to ones that salty/bitter (At the top we can find coffee, McWrap, Gran Crispy McBacon). We can call this axis "Saltiness/bitterness".
#the horizontal axis opposes products that are high in calories and fats (My Selection BBQ, My Selection Asiago DOP & Bacon, Gran Crispy McBacon) to products that are low in carbs and fats (coffee, Sprite, Coca Cola). Since the regression has shown that Total Fats are highly correlated with Calories, we can define this axis as "Nutritional Value".
# Save the first 2 principal components to the McDonalds dataframe
McDonalds_Menu_PC = data_clean_2
McDonalds_Menu_PC = cbind(data_clean_2, results.pca$x[, 1:2])
#Removing outliers identified during regression
McDonalds_Menu_PC <- McDonalds_Menu_PC[-c(15, 76, 109),]
#We can now perform a clustering analysis to describe clusters of products considering the two PCs we have identified.
#Define a seed
set.seed(3)
#to identify the optimal number of clusters, we will take into account Within Cluster Deviances decreasing, Silhouette Coefficients
# Elbow Methods for k-means Algorithm
standardized_numeric_data %>%
fviz_nbclust(FUN = kmeans, method = 'wss')
# Silhouette Methods for k-means Algorithm
standardized_numeric_data %>%
fviz_nbclust(FUN = kmeans, method = 'silhouette', verbose = TRUE)
#We have decided to extract 4 clusters. Although the silhouette method identifies 3 as the optimal number of clusters, we can see that there is an elbow point at 4. Moreover, the differences in average silhouette width between 3 and 4 clusters are marginal.
n_clusters = 4
# H. Ward's Method for creating cluster labels
res.hclust = hclust(dist(standardized_numeric_data), method = "ward.D")
# Adding cluster labels to data
clust_label = cutree(res.hclust, k = n_clusters)
McDonalds_Menu_PC = McDonalds_Menu_PC %>%
cbind(clust_label=clust_label)
#Plot of the cluster result based on the distribution on the 2 PCs
# Cluster by PC1 and PC2
plot_pc1_pc2 = cutree(res.hclust, k = n_clusters) %>%
data.frame() %>%
cbind(McDonalds_Menu_PC) %>%
ggplot() +
geom_point(aes(x = PC1,
y = PC2,
color = as.factor(.)
),
show.legend = T) +
geom_text(aes(x = PC1,
y = PC2,
color = as.factor(.)),
label=McDonalds_Menu_PC$name,
nudge_x=0.45, nudge_y=0.2,
check_overlap=T) +
labs(title = "Clusters by PC1 and PC2") +
geom_hline(yintercept = 0, colour="#999999") +
geom_vline(xintercept = 0, colour="#999999")
#plot
plot_pc1_pc2
#Red cluster: "Sugary Sweets"
#Green Cluster: "Fairly Healthy Snacks and Burgers"
#Blue cluster: "Low Cal"
#Purple cluster: "Unhealty Burgers"
# We now will apply the Hierarchical classification directly to the PCA result.
McDonalds_PC_Numeric = McDonalds_Menu_PC %>% dplyr::select(where(is.numeric))
McDonalds_PC_Numeric_std <- McDonalds_PC_Numeric
McDonalds_PC_Numeric_std$Calories_std = scale(McDonalds_PC_Numeric$Calories)
McDonalds_PC_Numeric_std$Total_Fats_std = scale(McDonalds_PC_Numeric$`Total Fats`)
McDonalds_PC_Numeric_std$Saturated_Fats_std = scale(McDonalds_PC_Numeric$`Saturated Fats`)
McDonalds_PC_Numeric_std$Total_Carbs_std = scale(McDonalds_PC_Numeric$`Total Carbs`)
McDonalds_PC_Numeric_std$Total_Sugars_std = scale(McDonalds_PC_Numeric$`Total Sugars`)
McDonalds_PC_Numeric_std$Dietary_Fiber_std = scale(McDonalds_PC_Numeric$`Dietary Fiber`)
McDonalds_PC_Numeric_std$Protein_std = scale(McDonalds_PC_Numeric$`Protein`)
McDonalds_PC_Numeric_std$Salt_std = scale(McDonalds_PC_Numeric$`Salt`)
McDonalds_PC_Numeric_std$PC1 = scale(McDonalds_PC_Numeric$`PC1`)
McDonalds_PC_Numeric_std$PC2 = scale(McDonalds_PC_Numeric$`PC2`)
# Remove unstandardized values
McDonalds_PC_Numeric_std <- McDonalds_PC_Numeric_std[ -c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14)]
# Check if means are equal to 0
summarise(McDonalds_PC_Numeric_std
, PC1 = mean(PC1)
, PC2 = mean(PC2)
)
## PC1 PC2
## 1 6.137963e-18 -1.058858e-17
# Check if standard deviations are equal to 1
summarise(McDonalds_PC_Numeric_std
, PC1 = sd(PC1)
, PC2 = sd(PC2)
)
## PC1 PC2
## 1 1 1
# Standard Deviation are equal to the square root of the eigenvalues!
summarise(McDonalds_Menu_PC
, PC1 = var(PC1)
, PC2 = var(PC2)
)
## PC1 PC2
## 1 3.701332 1.689065
# Choose the number of clusters
#We first create a new DF with the two principal components only
McDonalds_PC_Numeric_std_PC = McDonalds_PC_Numeric_std %>% select(PC1, PC2)
# Elbow Methods for k-means Algorithm
McDonalds_PC_Numeric_std_PC %>%
fviz_nbclust(FUN = kmeans, method = 'wss')
# Silhouette Methods for k-means Algorithm
McDonalds_PC_Numeric_std_PC %>%
fviz_nbclust(FUN = kmeans, method = 'silhouette', verbose = TRUE)
# After evaluating Within Cluster Deviances decreasing, and Silhouette Coefficients we have decided to extract 3 clusters
n_clusters_PCA = 3
# H. Ward's Method for creating cluster labels
res.hclust_PC = hclust(dist(McDonalds_PC_Numeric_std_PC), method = "ward.D")
# Cluster by PC1 and PC2
plot_pc1_pc2_pc = cutree(res.hclust_PC, k = n_clusters_PCA) %>%
data.frame() %>%
cbind(McDonalds_Menu_PC) %>%
ggplot() +
geom_point(aes(x = PC1,
y = PC2,
color = as.factor(.)
),
show.legend = T) +
geom_text(aes(x = PC1,
y = PC2,
color = as.factor(.)),
label=McDonalds_Menu_PC$name,
nudge_x=0.45, nudge_y=0.2,
check_overlap=T) +
labs(title = "Clusters (based on PC) by PC1 and PC2 ") +
geom_hline(yintercept = 0, colour="#999999") +
geom_vline(xintercept = 0, colour="#999999")
#plot
plot_pc1_pc2_pc
#Red cluster: "Sugary Sweets"
#Green cluster: "Salty/Low Sugar Food"
#Blue cluster: "Low Cal Food and condiments"
#As the number of clusters is only reduced from 4 to 3, and and as this has happened pretty much by uniting clusters 2&4 from the first hierarchical classification into one single cluster (cluster 2 from the second hierarchical clustering), we can conclude that the loss of information associated with reducing the variables to 2 PCs is not significant.