Project Work Team E

#Required library
library(blob)
library(cluster)
library(corrplot)

## Warning: le package 'corrplot' a 閠� compil� avec la version R 4.1.3

## corrplot 0.92 loaded

library(datasets)
library(DBI)
library(dbplyr)
library(dendextend)

## Warning: le package 'dendextend' a 閠� compil� avec la version R 4.1.3

## 
## ---------------------
## Welcome to dendextend version 1.15.2
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attachement du package : 'dendextend'

## L'objet suivant est masqué depuis 'package:stats':
## 
##     cutree

library(dplyr)

## 
## Attachement du package : 'dplyr'

## Les objets suivants sont masqués depuis 'package:dbplyr':
## 
##     ident, sql

## Les objets suivants sont masqués depuis 'package:stats':
## 
##     filter, lag

## Les objets suivants sont masqués depuis 'package:base':
## 
##     intersect, setdiff, setequal, union

library(factoextra)

## Warning: le package 'factoextra' a 閠� compil� avec la version R 4.1.3

## Le chargement a nécessité le package : ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(fmsb)

## Warning: le package 'fmsb' a 閠� compil� avec la version R 4.1.3

library(forcats)
library(formattable)

## Warning: le package 'formattable' a 閠� compil� avec la version R 4.1.3

library(GGally)

## Warning: le package 'GGally' a 閠� compil� avec la version R 4.1.3

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(ggplot2)
library(graphics)
library(grDevices)
library(magrittr)
library(methods)
library(patchwork)

## Warning: le package 'patchwork' a 閠� compil� avec la version R 4.1.3

## 
## Attachement du package : 'patchwork'

## L'objet suivant est masqué depuis 'package:formattable':
## 
##     area

library(plotly)

## Warning: le package 'plotly' a 閠� compil� avec la version R 4.1.3

## 
## Attachement du package : 'plotly'

## L'objet suivant est masqué depuis 'package:formattable':
## 
##     style

## L'objet suivant est masqué depuis 'package:ggplot2':
## 
##     last_plot

## L'objet suivant est masqué depuis 'package:stats':
## 
##     filter

## L'objet suivant est masqué depuis 'package:graphics':
## 
##     layout

library(plotrix)
library(plyr)

## Warning: le package 'plyr' a 閠� compil� avec la version R 4.1.3

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attachement du package : 'plyr'

## Les objets suivants sont masqués depuis 'package:plotly':
## 
##     arrange, mutate, rename, summarise

## Les objets suivants sont masqués depuis 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library(purrr)

## 
## Attachement du package : 'purrr'

## L'objet suivant est masqué depuis 'package:plyr':
## 
##     compact

## L'objet suivant est masqué depuis 'package:magrittr':
## 
##     set_names

library(readr)
library(readxl)
library(reprex)
library(Rsymphony)
library(splitstackshape)
library(stats)
library(stringr)
library(tibble)
library(tidyr)

## 
## Attachement du package : 'tidyr'

## L'objet suivant est masqué depuis 'package:magrittr':
## 
##     extract

library(tidyselect)
library(tidyverse)
library(utils)

#Import the dataset
library(readxl)
mcdonalds_it_20220221 <- read_excel("mcdonalds_it_20220221.xlsx")
data <- mcdonalds_it_20220221
View(data)
head(data)

## # A tibble: 6 x 15
##      ID `web-scraper-order` `web-scraper-s~` name  product_link `product_link-~`
##   <dbl> <chr>               <chr>            <chr> <lgl>        <chr>           
## 1     1 1631284741-140      https://www.mcd~ Milk~ NA           https://www.mcd~
## 2     2 1631285604-185      https://www.mcd~ Juni~ NA           https://www.mcd~
## 3     3 1631284758-141      https://www.mcd~ Cono  NA           https://www.mcd~
## 4     4 1631284516-126      https://www.mcd~ Capp~ NA           https://www.mcd~
## 5     5 1631285062-160      https://www.mcd~ Coca~ NA           https://www.mcd~
## 6     6 1631283842-95       https://www.mcd~ Mini~ NA           https://www.mcd~
## # ... with 9 more variables: Calories <dbl>, `Total Fats` <dbl>,
## #   `Saturated Fats` <dbl>, `Total Carbs` <dbl>, `Total Sugars` <dbl>,
## #   `Dietary Fiber` <dbl>, Protein <dbl>, Salt <dbl>, Rating <chr>

#Check dimension of the dataset
dim(data)

## [1] 124  15

#Removing columns we do not need (web scraper, web scraper-start-url, product_link)
data_clean <- data[ -c(2, 3, 5)]

#Removing missing values
data_clean_2 <- na.omit(data_clean)

#Converting characters in Rating variable into double
library(plyr)
data_clean_2$Rating <- revalue(data_clean_2$Rating,c("H"=2, "M"=1, "L"=0))
data_clean_2$Rating <- as.numeric(data_clean_2$Rating)

#Removing units of measure from numeric variables
Calories_1 <- as.numeric(as.character(substr(data_clean_2$Calories,1,4)))
TotalFats_1 <- as.numeric(as.character(substr(data_clean_2$`Total Fats`,1,4)))
SaturatedFats_1 <- as.numeric(as.character(substr(data_clean_2$`Saturated Fats`,1,4)))
TotalCarbs_1 <- as.numeric(as.character(substr(data_clean_2$`Total Carbs`,1,4)))
TotalSugars_1 <- as.numeric(as.character(substr(data_clean_2$`Total Sugars`,1,4)))
DietaryFiber_1 <- as.numeric(as.character(substr(data_clean_2$`Dietary Fiber`,1,4)))
Protein_1 <- as.numeric(as.character(substr(data_clean_2$Protein,1,4)))
Salt_1 <- as.numeric(as.character(substr(data_clean_2$Salt,1,4)))

data_clean_2$Calories <- Calories_1
data_clean_2$`Total Fats`<- TotalFats_1
data_clean_2$`Saturated Fats` <- SaturatedFats_1
data_clean_2$`Total Carbs`<-TotalCarbs_1
data_clean_2$`Total Sugars`<- TotalSugars_1
data_clean_2$`Dietary Fiber`<- DietaryFiber_1
data_clean_2$Protein <- Protein_1
data_clean_2$Salt <- Salt_1

#Synthesising the results of the cleaned dataset
summary(data_clean_2)

##        ID             name           product_link-href     Calories    
##  Min.   :  1.00   Length:120         Length:120         Min.   :  0.0  
##  1st Qu.: 31.75   Class :character   Class :character   1st Qu.: 93.5  
##  Median : 62.50   Mode  :character   Mode  :character   Median :234.5  
##  Mean   : 62.12                                         Mean   :241.4  
##  3rd Qu.: 92.25                                         3rd Qu.:328.5  
##  Max.   :123.00                                         Max.   :744.0  
##    Total Fats    Saturated Fats    Total Carbs     Total Sugars  
##  Min.   : 0.00   Min.   : 0.000   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 2.70   1st Qu.: 0.675   1st Qu.:12.00   1st Qu.: 4.85  
##  Median : 8.20   Median : 3.200   Median :26.50   Median : 8.95  
##  Mean   :10.69   Mean   : 4.371   Mean   :27.48   Mean   :14.33  
##  3rd Qu.:17.00   3rd Qu.: 6.325   3rd Qu.:42.00   3rd Qu.:22.00  
##  Max.   :46.00   Max.   :20.000   Max.   :89.00   Max.   :57.00  
##  Dietary Fiber      Protein            Salt            Rating     
##  Min.   :0.000   Min.   : 0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.: 0.975   1st Qu.:0.0800   1st Qu.:1.000  
##  Median :1.000   Median : 5.000   Median :0.2900   Median :2.000  
##  Mean   :1.277   Mean   : 8.176   Mean   :0.7586   Mean   :1.483  
##  3rd Qu.:2.100   3rd Qu.: 9.100   3rd Qu.:1.2000   3rd Qu.:2.000  
##  Max.   :4.800   Max.   :47.000   Max.   :4.8000   Max.   :2.000

#Check dimension of the cleaned dataset (data_clean_2)
dim(data_clean_2)

## [1] 120  12

#Creating "Category" variable to group all products into macro-areas
data_clean_2$Category = case_when (substring (data_clean_2$`product_link-href`,35,40)=='panini'~'Panini',
                                   substring(data_clean_2$`product_link-href`,35,50)=='gelati-e-dessert'~'Gelati e Dessert', 
                                   substring(data_clean_2$`product_link-href`,35,40)=='mccafe'~'McCafe',
                                   substring(data_clean_2$`product_link-href`,35,41)=='bevande'~'Bevande',
    
substring(data_clean_2$`product_link-href`,35,39)=='salse'~'Salse',

substring(data_clean_2$`product_link-href`,35,42)=='insalate'~'Insalate',

substring(data_clean_2$`product_link-href`,35,42)=='patatine'~'Patatine',

substring(data_clean_2$`product_link-href`,35,46)=='frescallegre'~'Frescallegre',

substring(data_clean_2$`product_link-href`,35,54)=='nuggets-e-sfiziosita'~'Nuggets e Sfiziosità')

#Creating categorical variable for McCafe
data_clean_2$'Bevande McCafe' = case_when (substring (data_clean_2$`product_link-href`,42,58)=='caffetteria-calda'~'Caffetteria Calda',
         
  substring(data_clean_2$`product_link-href`,42,58)=='succhi-e-spremute'~'Succhi e Spremute')

#Frequency table per Category with the percentages indicating the weight of each in the total
category_freq <- data_clean_2 %>% count('Category')

category_freq$perc <- (category_freq$freq)*100/nrow(data_clean_2)
category_freq

##               Category freq      perc
## 1              Bevande    9  7.500000
## 2         Frescallegre    3  2.500000
## 3     Gelati e Dessert   20 16.666667
## 4             Insalate    4  3.333333
## 5               McCafe   48 40.000000
## 6 Nuggets e Sfiziosità    4  3.333333
## 7               Panini   23 19.166667
## 8             Patatine    4  3.333333
## 9                Salse    5  4.166667

#Pie chart Category to understand the distribution of all categories
pie(table(data_clean_2$Category))

#Categories with the highest and lowest number of items
paste("The category with the highest number of products is:", category_freq$Category[category_freq$freq == max(category_freq$freq) ], "with:", max(category_freq$freq), "items. The category with the lowest number of product is:", category_freq$Category[category_freq$freq == min(category_freq$freq)], "with:", min(category_freq$freq), "items")

## [1] "The category with the highest number of products is: McCafe with: 48 items. The category with the lowest number of product is: Frescallegre with: 3 items"

#Exploring which are the categories in McCafe
McCafe <- data_clean_2 %>% filter(Category == 'McCafe')
McCafe <- McCafe[,c(2,13)]
McCafe

## # A tibble: 48 x 2
##    name                          Category
##    <chr>                         <chr>   
##  1 Cappuccino grande             McCafe  
##  2 Mini Donut cacao              McCafe  
##  3 Cremolosa                     McCafe  
##  4 Margherite con confettura     McCafe  
##  5 Cremolosa al caffè            McCafe  
##  6 Treccia alla crema            McCafe  
##  7 Mini Muffin cioccolato bianco McCafe  
##  8 Tè caldo e infusi             McCafe  
##  9 Succo di frutta pera          McCafe  
## 10 Muffin cioccolato             McCafe  
## # ... with 38 more rows

#Average calories in the whole menu 
library(tidyverse)
mean_cal = round(mean(data_clean_2$Calories),2) 
paste("The average calories in the whole menu is:", mean_cal)

## [1] "The average calories in the whole menu is: 241.42"

#Average Calories in Each Category
m_cal=aggregate(data_clean_2$Calories,by=list(type=data_clean_2$Category),mean)
data.frame("Category"=m_cal$type,"Average calories"=m_cal$x)

##               Category Average.calories
## 1              Bevande         92.11111
## 2         Frescallegre         29.66667
## 3     Gelati e Dessert        275.60000
## 4             Insalate        160.75000
## 5               McCafe        186.66667
## 6 Nuggets e Sfiziosità        186.50000
## 7               Panini        454.39130
## 8             Patatine        343.75000
## 9                Salse         73.00000

# Saturated Fats Boxplot by Category
ggplot(data_clean_2, aes(x=Category, y= data_clean_2$'Saturated Fats')) +
  geom_boxplot(aes(fill=Category), outlier.colour = "red") +
  coord_flip() +
  labs(title= "Saturated Fats Boxplot by Category") +
  xlab("") +
  ylab("Saturated Fats")

## Warning: Use of `data_clean_2$"Saturated Fats"` is discouraged. Use `Saturated
## Fats` instead.

#Correlation matrix to determine the strength and the direction of relationship between the variables.
corrplot(cor(data_clean_2 %>% select_if(is.numeric), method = "pearson"), method = "circle")

#In this matrix we can also understand what the Rating (0,1,2) is due to: it is strongly negatively correlated with Saturated Fats, very negatively correlated with Calories, Total Fats and Total Carbs, but poorly correlated with Dietary Fiber, Protein and Salt

#Let's get the total calories in the whole menu then see how much do "Beverages" and "Coffee" and "Spremute" contribute
menu_tot_cal = sum(data_clean_2$Calories)
paste("The total calories in the whole menu is:", menu_tot_cal)

## [1] "The total calories in the whole menu is: 28970"

#Now let's see by how much "Beverage" contribute to the overall caloric intake 
Bevande_total_cal = data_clean_2 %>%
                      filter(Category == "Bevande") %>%
                      select(Calories) %>%
                      slice(1:20) %>%
                      sum()
Bevande_percent = round((Bevande_total_cal / menu_tot_cal * 100),2)
paste("Bevarages contribute", Bevande_percent, "% to the total calories")

## [1] "Bevarages contribute 2.86 % to the total calories"

#Now let's see by how much "Coffee" contribute to the overall caloric intake 
Coffee_total_cal = data_clean_2 %>%
                      filter(data_clean_2$'Bevande McCafe' == "Caffetteria Calda") %>%
                      select("Calories") %>%
                      slice(9:n()) %>%
                      sum()
Coffee_percent = round((Coffee_total_cal / menu_tot_cal * 100),2)
paste("Coffee contributes", Coffee_percent, "% to the total calories")

## [1] "Coffee contributes 2.14 % to the total calories"

#Now let's see by how much "Spremute" contribute to the overall caloric intake 

succhi_e_spremute_total_cal =  data_clean_2%>%
                      filter(data_clean_2$`Bevande McCafe` == "Succhi e Spremute") %>%                   
                      select(Calories) %>%                                               
                      slice(1:20) %>%                             
                      sum()
Spremute_percent = round((succhi_e_spremute_total_cal / menu_tot_cal * 100),2)
paste("Spremute contributes", Spremute_percent, "% to the total calories")

## [1] "Spremute contributes 1.58 % to the total calories"

#Create 2 DataFrame: one for grilled chicken products and one for crispy chicken products
Grilled <- data_clean_2 %>% filter(name == "McWrap® con petto di pollo alla piastra e Parmigiano Reggiano" | name == "Insalata con petto di pollo alla piastra e Parmigiano Reggiano" | name == "Junior Chicken" | name == "Chicken Country") 
df_Grilled <- Grilled[,c(2,4,12)]
df_Grilled

## # A tibble: 4 x 3
##   name                                                           Calories Rating
##   <chr>                                                             <dbl>  <dbl>
## 1 Junior Chicken                                                      206      2
## 2 Insalata con petto di pollo alla piastra e Parmigiano Reggiano      201      2
## 3 McWrap® con petto di pollo alla piastra e Parmigiano Reggiano       426      2
## 4 Chicken Country                                                     447      2

Crispy <- data_clean_2 %>% filter( name == "Chickenburger" | name == "McChicken® Originale" |name == "Double Chicken BBQ" | name == "My Selection Chicken Pepper" | ID == 22 | name == "Insalata con petto di pollo croccante e Parmigiano Reggiano")
df_Crispy <- Crispy[,c(2,4,12)]
df_Crispy

## # A tibble: 6 x 3
##   name                                                           Calories Rating
##   <chr>                                                             <dbl>  <dbl>
## 1 "McWrap® con petto di pollo croccante \r\ne Parmigiano Reggia~      535      1
## 2 "Insalata con petto di pollo croccante e Parmigiano Reggiano"       313      2
## 3 "Chickenburger"                                                     323      2
## 4 "McChicken® Originale"                                              446      2
## 5 "Double Chicken BBQ"                                                558      1
## 6 "My Selection Chicken Pepper"                                       636      1

#How does nutritional value change?
summary(df_Grilled)

##      name              Calories         Rating 
##  Length:4           Min.   :201.0   Min.   :2  
##  Class :character   1st Qu.:204.8   1st Qu.:2  
##  Mode  :character   Median :316.0   Median :2  
##                     Mean   :320.0   Mean   :2  
##                     3rd Qu.:431.2   3rd Qu.:2  
##                     Max.   :447.0   Max.   :2

summary(df_Crispy)

##      name              Calories         Rating   
##  Length:6           Min.   :313.0   Min.   :1.0  
##  Class :character   1st Qu.:353.8   1st Qu.:1.0  
##  Mode  :character   Median :490.5   Median :1.5  
##                     Mean   :468.5   Mean   :1.5  
##                     3rd Qu.:552.2   3rd Qu.:2.0  
##                     Max.   :636.0   Max.   :2.0

#Average calories are higher for products with Crispy Chicken (468,5 vs 320) and the mean of Rating is lower (1,5 vs 2)

#Comparison between Salad and McWrap (Grilled and Crispy)
#These are the only products we have in both Grilled and Crispy versions, so we can compare them directly

SW_Grilled = Grilled %>% filter(name == "Insalata con petto di pollo alla piastra e Parmigiano Reggiano" | name == "McWrap® con petto di pollo alla piastra e Parmigiano Reggiano")

SW_Crispy =  Crispy %>% filter(name == "Insalata con petto di pollo croccante e Parmigiano Reggiano" | ID == 22)

SW_Grilled

## # A tibble: 2 x 14
##      ID name             `product_link-~` Calories `Total Fats` `Saturated Fats`
##   <dbl> <chr>            <chr>               <dbl>        <dbl>            <dbl>
## 1    31 Insalata con pe~ https://www.mcd~      201          8.2              4.6
## 2    77 McWrap® con pet~ https://www.mcd~      426         17                6  
## # ... with 8 more variables: `Total Carbs` <dbl>, `Total Sugars` <dbl>,
## #   `Dietary Fiber` <dbl>, Protein <dbl>, Salt <dbl>, Rating <dbl>,
## #   Category <chr>, `Bevande McCafe` <chr>

SW_Crispy

## # A tibble: 2 x 14
##      ID name             `product_link-~` Calories `Total Fats` `Saturated Fats`
##   <dbl> <chr>            <chr>               <dbl>        <dbl>            <dbl>
## 1    22 "McWrap® con pe~ https://www.mcd~      535           25              6.7
## 2    25 "Insalata con p~ https://www.mcd~      313           17              5.3
## # ... with 8 more variables: `Total Carbs` <dbl>, `Total Sugars` <dbl>,
## #   `Dietary Fiber` <dbl>, Protein <dbl>, Salt <dbl>, Rating <dbl>,
## #   Category <chr>, `Bevande McCafe` <chr>

#Merge the two dataframe
SW_merged <- rbind(SW_Grilled,SW_Crispy)
SW_merged <- SW_merged[,c(2,4)]
SW_merged

## # A tibble: 4 x 2
##   name                                                             Calories
##   <chr>                                                               <dbl>
## 1 "Insalata con petto di pollo alla piastra e Parmigiano Reggiano"      201
## 2 "McWrap® con petto di pollo alla piastra e Parmigiano Reggiano"       426
## 3 "McWrap® con petto di pollo croccante \r\ne Parmigiano Reggiano"      535
## 4 "Insalata con petto di pollo croccante e Parmigiano Reggiano"         313

#Bar Plot with grilled and cripsy chicken compared
SW_merged %>% 
  ggplot(aes(x= name, y= `Calories`)) +
  geom_bar(stat="identity", alpha = .5) +
  coord_flip() +
  labs(title = "Calories in Grilled and Crispy products", fill = "Category") +
  xlab("") +
  ylab("Calories")

#As we can see in the bar plot, both products contain more calories in the Crispy version

#With regard to the data available to us, we have chosen to focus on Saturated Fat and Salt, in order to construct a new Rating that indicates the product’s risk for cholesterol

Colest <- subset(data_clean_2, select=c(name, `Saturated Fats`, Salt))

# Standardize quantitative variables
Colest$SaturatedFats_stz = scale(Colest$`Saturated Fats`)
Colest$Salt_stz = scale(Colest$Salt)

# Check if means are equal to 0
summarise(Colest, 
          avg_SaturatedFats_stz = mean(SaturatedFats_stz), 
          avg_Salt_stz = mean(Salt_stz))

##   avg_SaturatedFats_stz  avg_Salt_stz
## 1         -4.919296e-17 -4.278352e-17

# Check if standard deviations are equal to 1
summarise(Colest,
          sd_SaturatedFats_stz = sd(SaturatedFats_stz),
          sd_Salt_stz = sd(Salt_stz))

##   sd_SaturatedFats_stz sd_Salt_stz
## 1                    1           1

# Choose the number of cluster based on Elbow Methods for k-means Algorithm:
Colest %>%
  select(SaturatedFats_stz, Salt_stz)  %>%
  fviz_nbclust(FUN = kmeans, method = 'wss')

#The number of clusters is 3

# Define a seed: 
# Setting random seed allows to initialize a pseudorandom number generator reproducing the same output
set.seed(3)

# Select Standardized columns
Colest_stz = Colest %>%
  select(SaturatedFats_stz, Salt_stz) 

# Performing Cluster Analysis with Kmeans algorithm ("iter.max" used to reduce the computational time) #Evaluate 20 different starting points and choose the best one
clusterK = kmeans(Colest_stz, centers = 3, nstart=20, iter.max=100)


# Plot k-means
clusterK  %>%
  fviz_cluster(geom = 'point', data = Colest_stz) +
  ggtitle("k-means clustering: k = 3") +
  theme(legend.position = 'bottom', 
        legend.title = element_blank())

# Print Within Deviance
print(paste("Within Deviance:", round(sum(clusterK$withinss), 1)))

## [1] "Within Deviance: 74.5"

# Plot Silhouette Coefficient
plot(silhouette(clusterK$cluster, dist(Colest_stz)))

Colest$Rating <- case_when(
  clusterK$cluster =='1' ~ "Low Risk",
  clusterK$cluster=='2' ~ "High Risk",
  clusterK$cluster=='3' ~ "Medium Risk")
View(Colest)

#Now in the Colest dataframe is possible to see if each product belongs to low, medium or high risk for cholesterol

What is the least number of items could you order from the menu to meet one day’s nutritional requirements?

European nutritional requirements extracted from UK government document: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/618167/government_dietary_recommendations.pdf Detailed requirements are averaged for both men and women, aged 19-64:

data.frame(
  "Factors"=c("Calories","Total Fats","Saturated Fats","Total Carbs","Total Sugars","Dietary Fiber","Protein","Salt"),
  "Requirements"=c("2250 kcal","less than 87.5g","less than 27.5g","at least 300g","less than 30g","at least 30g","50g","less than 6g")
)

##          Factors    Requirements
## 1       Calories       2250 kcal
## 2     Total Fats less than 87.5g
## 3 Saturated Fats less than 27.5g
## 4    Total Carbs   at least 300g
## 5   Total Sugars   less than 30g
## 6  Dietary Fiber    at least 30g
## 7        Protein             50g
## 8           Salt    less than 6g

For calories and protein, we will consider these requirements as minimum requirement (at least 2250 kcal and 50g, respectively.)

We use Rsymphony to find the best solution that satisfies all the requirements above.

#Get the number
library(Rsymphony)
obj <- c(rep(1, times=120))
mat <- matrix(c(data_clean_2$Calories,
                  data_clean_2$`Total Fats`,
                  data_clean_2$`Saturated Fats`,
                  data_clean_2$`Total Carbs`,
                  data_clean_2$`Total Sugars`,
                  data_clean_2$`Dietary Fiber`,
                  data_clean_2$Protein,
                  data_clean_2$Salt), nrow=8, byrow = T)
dir <- c(">=","<=","<=",">=","<=",">=",">=","<=")
rhs <- c(2250,87.5,27.5,300,30,30,50.25,6)
max <- FALSE
types <- c(rep("I", times=120))
objval = Rsymphony_solve_LP(obj,mat,dir,rhs, types = types,max = max)$objval
cat("The least number of items you could order from the menu to meet one day's nutritional requirements is", objval, ".")

## The least number of items you could order from the menu to meet one day's nutritional requirements is 15 .

Now we see further what are these 15 items that we should order. From the table below, we can see that the reason why there are 5 palatine in the mixed order is that palatine is rich in calories while low in salt and sugars; and the reason why there are 4 caffe is that caffe is rich in total carbs but low in every other category.

#See in detail what are the ordered items
number_of_orders = Rsymphony_solve_LP(obj,mat,dir,rhs, types = types,max = max)$solution
data_clean_2$`number of orders` <-number_of_orders
Orders_sorted = data_clean_2 %>%
  arrange(desc(`number of orders`))
Orders_sorted_1 <- Orders_sorted[which(Orders_sorted$`number of orders`!= 0),c(2,15,4,5,6,7,8,9,10,11)]
Orders_sorted_1

## # A tibble: 5 x 10
##   name     `number of ord~` Calories `Total Fats` `Saturated Fats` `Total Carbs`
##   <chr>               <dbl>    <dbl>        <dbl>            <dbl>         <dbl>
## 1 Patatin~                5      330         16                1.5          41  
## 2 Caffè g~                4       27          0                0             5.3
## 3 Insalat~                3       25          0.5              0             3  
## 4 Junior ~                2      206          2.8              0.6          30  
## 5 Mela                    1       41          0.2              0             9.1
## # ... with 4 more variables: `Total Sugars` <dbl>, `Dietary Fiber` <dbl>,
## #   Protein <dbl>, Salt <dbl>

Now we examine our results by checking the total nutritional factors of our order.

#Detailed menu
Total_Factors_Ordered <- data.frame(
  "Factors"=c("Calories","Total Fats (less than)","Saturated Fats(less than)","Total Carbs (at least)","Total Sugars (less than)","Dietary Fiber (at least)","Protein","Salt (less than)"),
  "Total Ordered"=c(crossprod(Orders_sorted_1$`number of orders`,Orders_sorted_1$Calories), 
                    crossprod(Orders_sorted_1$`number of orders`,Orders_sorted_1$`Total Fats`),
                    crossprod(Orders_sorted_1$`number of orders`,Orders_sorted_1$`Saturated Fats`),
                    crossprod(Orders_sorted_1$`number of orders`,Orders_sorted_1$`Total Carbs`),
                    crossprod(Orders_sorted_1$`number of orders`,Orders_sorted_1$`Total Sugars`),
                    crossprod(Orders_sorted_1$`number of orders`,Orders_sorted_1$`Dietary Fiber`),
                    crossprod(Orders_sorted_1$`number of orders`,Orders_sorted_1$Protein),
                    crossprod(Orders_sorted_1$`number of orders`,Orders_sorted_1$Salt)),
  "Requirements"=c(2250,87.5,27.5,300,30,30,50.25,6))
Total_Factors_Ordered

##                     Factors Total.Ordered Requirements
## 1                  Calories       2286.00      2250.00
## 2    Total Fats (less than)         87.30        87.50
## 3 Saturated Fats(less than)          8.70        27.50
## 4    Total Carbs (at least)        304.30       300.00
## 5  Total Sugars (less than)         28.50        30.00
## 6  Dietary Fiber (at least)         30.00        30.00
## 7                   Protein         58.40        50.25
## 8          Salt (less than)          5.77         6.00

Which McDonald’s menu items are high in Dietary Fiber? In Protein?

#Plots of top 10 in each factor

Calories_sorted = data_clean_2 %>%
  arrange(desc(Calories))
Calories_sorted_1 <- Calories_sorted[1:10,c(2,4,13)]
p1 <- Calories_sorted_1 %>% 
  mutate(name = fct_reorder(name,Calories)) %>%
  ggplot(aes(x= name, y= Calories, fill= Category)) +
  geom_bar(stat="identity", alpha = .5) +
  coord_flip() +
  labs(title = "Top 10 in Calories", fill = "Category") +
  xlab("") +
  ylab("Calories")

Total_Fats_sorted = data_clean_2 %>%
  arrange(desc(`Total Fats`))
Total_Fats_sorted_1 <- Total_Fats_sorted[1:10,c(2,5,13)]
p2 <- Total_Fats_sorted_1 %>% 
  mutate(name = fct_reorder(name,`Total Fats`)) %>%
  ggplot(aes(x= name, y= `Total Fats`, fill= Category)) +
  geom_bar(stat="identity", alpha = .5) +
  coord_flip() +
  labs(title = "Top 10 in Total Fats", fill = "Category") +
  xlab("") +
  ylab("Total Fats")

Saturated_Fats_sorted = data_clean_2 %>%
  arrange(desc(`Saturated Fats`))
Saturated_Fats_sorted_1 <- Saturated_Fats_sorted[1:10,c(2,6,13)]
p3 <- Saturated_Fats_sorted_1 %>% 
  mutate(name = fct_reorder(name,`Saturated Fats`)) %>%
  ggplot(aes(x= name, y= `Saturated Fats`, fill= Category)) +
  geom_bar(stat="identity", alpha = .5) +
  coord_flip() +
  labs(title = "Top 10 in Saturated Fats", fill = "Category") +
  xlab("") +
  ylab("Saturated Fats")

Total_Carbs_sorted = data_clean_2 %>%
  arrange(desc(`Total Carbs`))
Total_Carbs_sorted_1 <- Total_Carbs_sorted[1:10, c(2,7,13)]
p4 <- Saturated_Fats_sorted_1 %>% 
  mutate(name = fct_reorder(name,`Saturated Fats`)) %>%
  ggplot(aes(x= name, y= `Saturated Fats`, fill= Category)) +
  geom_bar(stat="identity", alpha = .5) +
  coord_flip() +
  labs(title = "Top 10 in Total Carbs", fill = "Category") +
  xlab("") +
  ylab("Total Carbs")

Total_Sugars_sorted = data_clean_2 %>%
  arrange(desc(`Total Sugars`))
Total_Sugars_sorted_1 <- Total_Sugars_sorted[1:10, c(2,8,13)]
p5 <- Total_Sugars_sorted_1 %>% 
  mutate(name = fct_reorder(name,`Total Sugars`)) %>%
  ggplot(aes(x= name, y= `Total Sugars`, fill= Category)) +
  geom_bar(stat="identity", alpha = .5) +
  coord_flip() +
  labs(title = "Top 10 in Total Sugars", fill = "Category") +
  xlab("") +
  ylab("Total Sugars")

Dietary_Fiber_sorted = data_clean_2 %>%
  arrange(desc(`Dietary Fiber`))
Dietary_Fiber_sorted_1 <- Dietary_Fiber_sorted[1:10, c(2,9,13)]
p6 <- Dietary_Fiber_sorted_1 %>% 
  mutate(name = fct_reorder(name,`Dietary Fiber`)) %>%
  ggplot(aes(x= name, y= `Dietary Fiber`, fill= Category)) +
  geom_bar(stat="identity", alpha = .5) +
  coord_flip() +
  labs(title = "Top 10 in Dietary Fiber", fill = "Category") +
  xlab("") +
  ylab("Dietary Fiber")

Protein_sorted = data_clean_2 %>%
  arrange(desc(Protein))
Protein_sorted_1 <- Protein_sorted[1:10, c(2,10,13)]
p7 <- Protein_sorted_1 %>% 
  mutate(name = fct_reorder(name,Protein)) %>%
  ggplot(aes(x= name, y= Protein, fill= Category)) +
  geom_bar(stat="identity", alpha = .5) +
  coord_flip() +
  labs(title = "Top 10 in Protein", fill = "Category") +
  xlab("") +
  ylab("Protein")

Salt_sorted = data_clean_2 %>%
  arrange(desc(Salt))
Salt_sorted_1 <- Salt_sorted[1:10, c(2,11,13)]
p8 <- Salt_sorted_1 %>% 
  mutate(name = fct_reorder(name,Salt)) %>%
  ggplot(aes(x= name, y= Salt, fill= Category)) +
  geom_bar(stat="identity", alpha = .5) +
  coord_flip() +
  labs(title = "Top 10 in Salt", fill = "Category") +
  xlab("") +
  ylab("Salt")

#Set the colors so the colors of categories are the same across plots
p1+scale_fill_manual(values=alpha(c("black","orange"),0.3))

p2+scale_fill_manual(values=alpha(c("black","orange"),0.3))

p3+scale_fill_manual(values=alpha(c("pink","black","orange"),0.3))

p4+scale_fill_manual(values=alpha(c("pink","black","orange"),0.3))

p5+scale_fill_manual(values=alpha(c("blue","pink","black"),0.3))

p6+scale_fill_manual(values=alpha(c("green","black","orange","red"),0.3))

p7+scale_fill_manual(values=alpha(c("green","orange"),0.3))

p8+scale_fill_manual(values=alpha(c("orange"),0.3))

KNN

#Divide the dataset into training set and test set. The percentage of sizes of training and test is 70% to 30%.
set.seed(1234)
index=sample(2,nrow(data_clean_2),replace=TRUE,prob=c(0.7,0.3))   

train = data_clean_2[index==1,c(2,4:12)]  
test = data_clean_2[index==2,c(2,4:12)]   

head(train)

## # A tibble: 6 x 10
##   name       Calories `Total Fats` `Saturated Fats` `Total Carbs` `Total Sugars`
##   <chr>         <dbl>        <dbl>            <dbl>         <dbl>          <dbl>
## 1 Milkshake~      268          4.9              3.3          49             45  
## 2 Junior Ch~      206          2.8              0.6          30              6.6
## 3 Cono            140          3.6              2.1          24             17  
## 4 Cappuccin~      143          9                6             8.8            8.8
## 5 Mini Donu~       80          5                2.8           7.7            2.4
## 6 Gluten Fr~      457         25               13            33              4.2
## # ... with 4 more variables: `Dietary Fiber` <dbl>, Protein <dbl>, Salt <dbl>,
## #   Rating <dbl>

head(test)

## # A tibble: 6 x 10
##   name       Calories `Total Fats` `Saturated Fats` `Total Carbs` `Total Sugars`
##   <chr>         <dbl>        <dbl>            <dbl>         <dbl>          <dbl>
## 1 Coca Cola®      168          0                0            42               42
## 2 Ketchup          33          0.1              0             7.4              6
## 3 Sundae Fr~      273          3.9              2.7          55               50
## 4 Tè caldo ~        0          0                0             0                0
## 5 McFlurry®~      453         17                8.2          66               54
## 6 McCrunchy~      231          6.6              2.2          35               12
## # ... with 4 more variables: `Dietary Fiber` <dbl>, Protein <dbl>, Salt <dbl>,
## #   Rating <dbl>

#Standardization of the training and test set
stdtrain = as.data.frame(scale(train[,-c(1,10)]))  
stdtest = as.data.frame(scale(test[,-c(1,10)]))   
summary(stdtrain[,1])

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.39299 -0.84942 -0.07455  0.00000  0.50371  2.90928

mean(stdtrain[,1])

## [1] -6.307026e-17

sd(stdtrain[,1])

## [1] 1

#Results of the KNN model
library(class)
knn = knn(stdtrain, stdtest, train$Rating, k=3) 

test$predictions = knn 
test$misclassified = test$Rating != test$predictions
head(test)

## # A tibble: 6 x 12
##   name       Calories `Total Fats` `Saturated Fats` `Total Carbs` `Total Sugars`
##   <chr>         <dbl>        <dbl>            <dbl>         <dbl>          <dbl>
## 1 Coca Cola®      168          0                0            42               42
## 2 Ketchup          33          0.1              0             7.4              6
## 3 Sundae Fr~      273          3.9              2.7          55               50
## 4 Tè caldo ~        0          0                0             0                0
## 5 McFlurry®~      453         17                8.2          66               54
## 6 McCrunchy~      231          6.6              2.2          35               12
## # ... with 6 more variables: `Dietary Fiber` <dbl>, Protein <dbl>, Salt <dbl>,
## #   Rating <dbl>, predictions <fct>, misclassified <lgl>

#Confusion matrix
t = table(knn,test$Rating) 
t

##    
## knn  0  1  2
##   0  1  0  0
##   1  0  9  2
##   2  0  1 14

#Calculate accuracy
nrow(test)

## [1] 27

accuracy = sum(diag(t))/nrow(test) 
error_rate = 1 - accuracy
cat("Accuracy = ", accuracy, ", error rate = ", error_rate)

## Accuracy =  0.8888889 , error rate =  0.1111111

It turns out that our KNN model is quite accurate! So we can go on to see which the best k is.

knn_i = NULL
error_rate = NULL

for(i in 1:30)   
  {
    knn_i = knn(stdtrain, stdtest, train$Rating, k=i)
    error_rate[i] = mean(knn_i != test$Rating)   
  }

rm(knn_i)
best_k= which.min(error_rate)

cat("Best k =", best_k)

## Best k = 3

We’re already using the best k! So we don’t need to redo the KNN.

Now let’s use this model to do some interesting things. There are some food that people believe are healthy, but actually are not that healthy. We found several food like this on the website of Carrefour, and we wonder how these food would be classified if it were a McDonald’s food. Then we can see whether they’re healthy or not.

#Input data of the "healthy" good
healthy_test <- data.frame(
  "name"=c("Smoothie innocent 75cl","Smoothie innocent no sugar added 75cl","Smoothie innocent 33cl","Cereal & fruit biscuit bio nutri+ 218g","Cereal & fruit biscuit bio nutri+ 55g","Yogurt Carrefour 125g","Yogurt Carrefour 250g","Salad Roma Sodebo (with oil sauce and a cookie) 320g","Penne Chicken salad with Caesar sauce 250g","Salad piémontaise bonduelle 320g"),
  "Calories"=c(410.3,397.5,138,844,211,110,220,682,440,419),
  "Total Fats"=c(0,0.8,0,15.3,3.8,3.6,7.2,35.2,25,30.4),
  "Saturated Fats"=c(0,0.8,0,2.4,0.6,2.4,4.8,4.8,5.5,3.2),
  "Total Carbs"=c(90,89.3,37.3,154.8,38.7,15,30,64,35,20.8),
  "Total Sugar"=c(75,81.8,33,85,21.3,15,30,16.6,33,3.2),
  "Dietary Fiber"=c(4.5,9.8,0,13.1,3.3,6,1.2,8.6,4.3,4.8),
  "Protein"=c(1.5,3.8,0,15.5,3.9,4.1,8.2,23,16.5,12.8),
  "Salt"=c(0,0,0,1.5,0.4,0.1,0.2,2.1,3.3,2.9))
healthy_test

##                                                    name Calories Total.Fats
## 1                                Smoothie innocent 75cl    410.3        0.0
## 2                 Smoothie innocent no sugar added 75cl    397.5        0.8
## 3                                Smoothie innocent 33cl    138.0        0.0
## 4                Cereal & fruit biscuit bio nutri+ 218g    844.0       15.3
## 5                 Cereal & fruit biscuit bio nutri+ 55g    211.0        3.8
## 6                                 Yogurt Carrefour 125g    110.0        3.6
## 7                                 Yogurt Carrefour 250g    220.0        7.2
## 8  Salad Roma Sodebo (with oil sauce and a cookie) 320g    682.0       35.2
## 9            Penne Chicken salad with Caesar sauce 250g    440.0       25.0
## 10                     Salad piémontaise bonduelle 320g    419.0       30.4
##    Saturated.Fats Total.Carbs Total.Sugar Dietary.Fiber Protein Salt
## 1             0.0        90.0        75.0           4.5     1.5  0.0
## 2             0.8        89.3        81.8           9.8     3.8  0.0
## 3             0.0        37.3        33.0           0.0     0.0  0.0
## 4             2.4       154.8        85.0          13.1    15.5  1.5
## 5             0.6        38.7        21.3           3.3     3.9  0.4
## 6             2.4        15.0        15.0           6.0     4.1  0.1
## 7             4.8        30.0        30.0           1.2     8.2  0.2
## 8             4.8        64.0        16.6           8.6    23.0  2.1
## 9             5.5        35.0        33.0           4.3    16.5  3.3
## 10            3.2        20.8         3.2           4.8    12.8  2.9

As we can see above, we mainly focus on 4 types of perceived healthy food: smoothie, cereal biscuit, fruit yogurt and salad.

#Standardization
stdhealthytest = as.data.frame(scale(healthy_test[,-c(1,10)])) 
stdmcdonalds = as.data.frame(scale(data_clean_2[,-c(1,2,3,12,13,14,15)]))
summary(stdhealthytest[,1])

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.17839 -0.73944  0.07108  0.00000  0.20224  1.94210

summary(stdmcdonalds[,1])

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.41824 -0.86896 -0.04063  0.00000  0.51158  2.95250

Now let’s use our KNN model to find out whether these food are really healthy or not (using the standard of McDonald’s.)

#KNN result
knn1 = knn(stdmcdonalds, stdhealthytest, data_clean_2$Rating, k=3) 
healthy_test$predictions = knn1 
healthy_test[,c(1,10)]

##                                                    name predictions
## 1                                Smoothie innocent 75cl           2
## 2                 Smoothie innocent no sugar added 75cl           2
## 3                                Smoothie innocent 33cl           2
## 4                Cereal & fruit biscuit bio nutri+ 218g           1
## 5                 Cereal & fruit biscuit bio nutri+ 55g           2
## 6                                 Yogurt Carrefour 125g           2
## 7                                 Yogurt Carrefour 250g           1
## 8  Salad Roma Sodebo (with oil sauce and a cookie) 320g           1
## 9            Penne Chicken salad with Caesar sauce 250g           1
## 10                     Salad piémontaise bonduelle 320g           2

Now we can see some of them are not rated highly, even by McDonald’s standard (which is not high!). The results are not the same every time, and sometimes the KNN model even classifies Smoothie 75cl as 1 (medium). So, when drinking sugar added smoothie and fruit yoghurt, or eating cereal biscuit (which claims to be bio and nutri+) and salad with sauce and biscuit, be careful! Don’t eat too much because they’re not as healthy as you think!

#How to create a menu including only healthy food?
#menu = panino + bevanda + salsa + patatine + nuggets e sfiziosità + gelati e dessert
#healthy = rating 2

healthy_panino <- filter(data_clean_2, Category == 'Panini' & Rating == '2')
healthy_bevanda <- filter(data_clean_2, Category == 'Bevande' & Rating == '2')
healthy_salsa <- filter(data_clean_2, Category == 'Salse' & Rating == '2')
healthy_Nuggets_e_Sfiziosità <- filter(data_clean_2, Category == 'Nuggets e Sfiziosità' & Rating == '2')
healthy_patatine <- filter(data_clean_2, Category == 'Patatine' & Rating == '2')
healthy_gelati_e_dessert <- filter(data_clean_2, Category == 'Gelati e Dessert' & Rating == '2')

panino_HM <- sample(healthy_panino$name, 1, replace = FALSE, prob = NULL)
bevanda_HM <- sample(healthy_bevanda$name, 1, replace = FALSE, prob = NULL)
salsa_HM <- sample(healthy_salsa$name, 1, replace = FALSE, prob = NULL)
bevanda_HM <- sample(healthy_bevanda$name, 1, replace = FALSE, prob = NULL)
nuggets_e_sfiziosità_HM <- sample(healthy_Nuggets_e_Sfiziosità$name, 1, replace = FALSE, prob = NULL)
patatine_HM <- sample(healthy_patatine$name, 1, replace = FALSE, prob = NULL)
gelati_e_dessert_HM <- sample(healthy_gelati_e_dessert$name, 1, replace = FALSE, prob = NULL)


healthy_menu <- data.frame(panino_HM, bevanda_HM, salsa_HM, nuggets_e_sfiziosità_HM, patatine_HM, gelati_e_dessert_HM)

View(healthy_menu)

#This chunk of code rendomly creates a McDonal's menu containing a burger, a drink, a sauce, chips, nuggets and snacks, and ice cream and desserts, each with a rating of 2 (High)

#Which factors affects the nutritional values?
#We have decided to run a regression to identify which among the factors affects the variable calories (that we have takesn as nutritional value)
library(ISLR2)
numeric_data <- data_clean_2[ -c(1, 2,3, 12, 13, 14, 15)]
standardized_numeric_data <- numeric_data


# Standardize quantitative variables
standardized_numeric_data$Calories_std = scale(numeric_data$Calories)
standardized_numeric_data$Total_Fats_std = scale(numeric_data$`Total Fats`)
standardized_numeric_data$Saturated_Fats_std = scale(numeric_data$`Saturated Fats`)
standardized_numeric_data$Total_Carbs_std = scale(numeric_data$`Total Carbs`)
standardized_numeric_data$Total_Sugars_std = scale(numeric_data$`Total Sugars`)
standardized_numeric_data$Dietary_Fiber_std = scale(numeric_data$`Dietary Fiber`)
standardized_numeric_data$Protein_std = scale(numeric_data$`Protein`)
standardized_numeric_data$Salt_std = scale(numeric_data$`Salt`)


# Check if means are equal to 0
summarise(standardized_numeric_data
          , avg_Calories_std = mean(Calories_std)
          , avg_Total_Fats_std = mean(Total_Fats_std)
          , avg_Saturated_Fats_std = mean(Saturated_Fats_std)
          , avg_Total_Carbs_std = mean(Total_Carbs_std)
          , avg_Total_Sugars_std = mean(Total_Sugars_std)
          , avg_Dietary_Fiber_std = mean(Dietary_Fiber_std)
          , avg_Protein_std = mean(Protein_std)
          , avg_Salt_std = mean(Salt_std))

##   avg_Calories_std avg_Total_Fats_std avg_Saturated_Fats_std
## 1     6.361121e-17       6.938894e-17          -4.919296e-17
##   avg_Total_Carbs_std avg_Total_Sugars_std avg_Dietary_Fiber_std
## 1        2.810659e-17         1.516076e-17           4.44035e-17
##   avg_Protein_std  avg_Salt_std
## 1    1.181893e-17 -4.278352e-17

# Check if standar deviations are equal to 1
summarise(standardized_numeric_data
          , sd_Calories_std = sd(Calories_std)
          , sd_Total_Fats_std = sd(Total_Fats_std)
          , sd_Saturated_Fats_std = sd(Saturated_Fats_std)
          , sd_Total_Carbs_std = sd(Total_Carbs_std)
          , sd_Total_Sugars_std = sd(Total_Sugars_std)
          , sd_Dietary_Fiber_std = sd(Dietary_Fiber_std)
          , sd_Protein_std = sd(Protein_std)
          , sd_Salt_std = sd(Salt_std))

##   sd_Calories_std sd_Total_Fats_std sd_Saturated_Fats_std sd_Total_Carbs_std
## 1               1                 1                     1                  1
##   sd_Total_Sugars_std sd_Dietary_Fiber_std sd_Protein_std sd_Salt_std
## 1                   1                    1              1           1

# Remove unstandardized values
standardized_numeric_data <- standardized_numeric_data[ -c(1, 2, 3, 4, 5, 6, 7, 8)]

# 9.(a)  Produce a scatterplot matrix which includes all of the variables in the data set.

pairs(standardized_numeric_data, lower.panel = NULL)

#produce a boxplot for each variable and identify outliers

boxplot(standardized_numeric_data$Calories_std,
        ylab = "Calories",
        main = "Boxplot with Calories",
      )

boxplot(standardized_numeric_data$Total_Fats_std,
        ylab = "Total Fats",
        main = "Boxplot with Total Fats",
      )

boxplot(standardized_numeric_data$Saturated_Fats_std,
        ylab = "Saturated Fats",
        main = "Boxplot with Saturated Fats",
      )

boxplot(standardized_numeric_data$Total_Carbs_std,
        ylab = "Total Carbs",
        main = "Boxplot with Total Carbs",
      )

boxplot(standardized_numeric_data$Total_Sugars_std,
        ylab = "Total Sugars",
        main = "Boxplot with Total Sugars",
      )

boxplot(standardized_numeric_data$Dietary_Fiber_std,
        ylab = "Dietary Fiber",
        main = "Boxplot with Dietary Fiber",
      )

boxplot(standardized_numeric_data$Protein_std,
        ylab = "Protein",
        main = "Boxplot with Protein",
      )

boxplot(standardized_numeric_data$Salt_std,
        ylab = "Salt",
        main = "Boxplot with Salt",
      )

library(dplyr)
#removing outliers for Calories
cal_out <- boxplot.stats(standardized_numeric_data$Calories_std)$out
cal_out_ind <- which(standardized_numeric_data$Calories_std %in% c(cal_out))
cal_out_ind

## [1]  15  76 109

#three outliers have been identified in rows 15, 76 and 109

standardized_numeric_data <- standardized_numeric_data[-c(15, 76, 109),]
#the three rows have been removed from the standardized_numeric_data df

tot_fats_out <- boxplot.stats(standardized_numeric_data$Total_Fats_std)$out
tot_fats_out_ind <- which(standardized_numeric_data$Total_Fats_std %in% c(tot_fats_out))
tot_fats_out_ind

## integer(0)

#total fats seems to present no outliers

sat_fats_out <- boxplot.stats(standardized_numeric_data$Saturated_Fats_std)$out
sat_fats_out_ind <- which(standardized_numeric_data$Saturated_Fats_std %in% c(sat_fats_out))
sat_fats_out_ind

## integer(0)

#saturated fats seems to present no outliers

tot_carbs_out <- boxplot.stats(standardized_numeric_data$Total_Carbs_std)$tot_carbs_out
tot_carbs_out_ind <- which(standardized_numeric_data$Total_Carbs_std %in% c(tot_carbs_out))
tot_carbs_out_ind

## integer(0)

#total carbs seems to present no outliers

tot_sugars_out <- boxplot.stats(standardized_numeric_data$Total_Sugars_std)$tot_sugars_out
tot_sugars_out_ind <- which(standardized_numeric_data$Total_Sugars_std %in% c(tot_sugars_out))
tot_sugars_out_ind

## integer(0)

#total sugars seems to present no outliers

diet_fiber_out <- boxplot.stats(standardized_numeric_data$Dietary_Fiber_std)$diet_fiber_out
diet_fiber_out_ind <- which(standardized_numeric_data$Dietary_Fiber_std %in% c(diet_fiber_out))
diet_fiber_out_ind

## integer(0)

#dietary fiber seems to present no outliers

diet_fiber_out <- boxplot.stats(standardized_numeric_data$Dietary_Fiber_std)$diet_fiber_out
diet_fiber_out_ind <- which(standardized_numeric_data$Dietary_Fiber_std %in% c(diet_fiber_out))
diet_fiber_out_ind

## integer(0)

#dietary fiber seems to present no outliers

protein_out <- boxplot.stats(standardized_numeric_data$Protein_std)$protein_out
protein_out_ind <- which(standardized_numeric_data$Protein_std %in% c(protein_out))
protein_out_ind

## integer(0)

#protein seems to present no outliers

salt_out <- boxplot.stats(standardized_numeric_data$Salt_std)$salt_out
salt_out_ind <- which(standardized_numeric_data$Protein_std %in% c(protein_out))
protein_out_ind

## integer(0)

#salt seems to present no outliers

df1_cor = cor(standardized_numeric_data) #calculate correlation between all the variables 

corrplot(df1_cor, type='upper',  diag = FALSE)

rm(df1_cor)
#there seems to be high positive correlation between total and saturated fats, between protein and salt. For what concerns the DV Calories, it seems to be positively correlated with all the IV (low correlation with Total Sugars)

set.seed(3)


lm.fit = lm(Calories_std~Total_Fats_std+Saturated_Fats_std+Total_Carbs_std+Total_Sugars_std+Dietary_Fiber_std+Protein_std+Salt_std, data=standardized_numeric_data) #linear regression model including all the IVs


summary(lm.fit)

## 
## Call:
## lm(formula = Calories_std ~ Total_Fats_std + Saturated_Fats_std + 
##     Total_Carbs_std + Total_Sugars_std + Dietary_Fiber_std + 
##     Protein_std + Salt_std, data = standardized_numeric_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.063565 -0.010458 -0.001331  0.009966  0.095466 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.0007969  0.0017432   0.457 0.648463    
## Total_Fats_std      0.5320369  0.0051273 103.766  < 2e-16 ***
## Saturated_Fats_std -0.0018396  0.0040470  -0.455 0.650320    
## Total_Carbs_std     0.4183789  0.0040763 102.636  < 2e-16 ***
## Total_Sugars_std   -0.0050930  0.0038386  -1.327 0.187348    
## Dietary_Fiber_std   0.0105058  0.0026311   3.993 0.000119 ***
## Protein_std         0.2289911  0.0050844  45.038  < 2e-16 ***
## Salt_std            0.0001195  0.0046882   0.025 0.979708    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01864 on 109 degrees of freedom
## Multiple R-squared:  0.9996, Adjusted R-squared:  0.9996 
## F-statistic: 3.821e+04 on 7 and 109 DF,  p-value: < 2.2e-16

par(mfrow=c(2,2))
plot(lm.fit)

#checking for multicollinearity
car::vif(lm.fit)

##     Total_Fats_std Saturated_Fats_std    Total_Carbs_std   Total_Sugars_std 
##           6.463850           3.979611           5.581175           5.015693 
##  Dietary_Fiber_std        Protein_std           Salt_std 
##           2.229444           5.895822           6.494178

#building a model without Total_Fats_std (high VIF, highly correlated with Saturated fats) and Salt_std (high VIF, highly correlated with Protein)
lm.fit2 = lm(Calories_std~Saturated_Fats_std+Total_Carbs_std+Total_Sugars_std+Dietary_Fiber_std+Protein_std, data=standardized_numeric_data) #linear regression model
summary(lm.fit2)

## 
## Call:
## lm(formula = Calories_std ~ Saturated_Fats_std + Total_Carbs_std + 
##     Total_Sugars_std + Dietary_Fiber_std + Protein_std, data = standardized_numeric_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44523 -0.08157 -0.02424  0.07466  0.81765 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.0004543  0.0172279  -0.026 0.979011    
## Saturated_Fats_std  0.3304249  0.0240379  13.746  < 2e-16 ***
## Total_Carbs_std     0.5348011  0.0358468  14.919  < 2e-16 ***
## Total_Sugars_std   -0.1180580  0.0326605  -3.615 0.000454 ***
## Dietary_Fiber_std   0.0849339  0.0249466   3.405 0.000923 ***
## Protein_std         0.3215030  0.0298336  10.777  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1846 on 111 degrees of freedom
## Multiple R-squared:  0.9593, Adjusted R-squared:  0.9575 
## F-statistic: 523.8 on 5 and 111 DF,  p-value: < 2.2e-16

par(mfrow=c(2,2))
plot(lm.fit2)

#checking for multicollinearity
car::vif(lm.fit2)

## Saturated_Fats_std    Total_Carbs_std   Total_Sugars_std  Dietary_Fiber_std 
##           1.432278           4.402877           3.704184           2.044527 
##        Protein_std 
##           2.070755

#no multicollinearity detected, therefore the second model is selected
#the F statistics is high (523,8), therefore the null hypothesis of homogeneity of variance between the predictors is rejected (also the p-value is statistically significant alpha≈0)
#residuals are roughly centered around zero (1Q=-0.082, median=-0.02, 3Q=0,074), therefore the model fits the assumption of heteroscedasticity
#the predictors are able to explain 96% of the variance in Calories (R-square=0.96)
#Saturated fats is significantly (p-value≈0) and positively correlated with Calories
#Total carbs is significantly (p-value≈0) and positively correlated with Calories
#Total sugars is significantly (p-value=0.0005) and negatively correlated with Calories
#Dietary fiber is significantly, although slightly less than other predictors (p-value=0,001), and positively correlated with Calories
#Protein is significantly (p-value≈0) and positively correlated with Calories

#Principal Component Analysis
# The objectives of this analysis are to:

# 1. Identify groups of similar products in McDonald's menu
# 2. determine which variables make one group different from another



library(ggplot2)
library(FactoMineR)

## Warning: le package 'FactoMineR' a 閠� compil� avec la version R 4.1.3

library(factoextra)
library(corrplot)
library(PerformanceAnalytics)

## Warning: le package 'PerformanceAnalytics' a 閠� compil� avec la version R
## 4.1.3

## Le chargement a nécessité le package : xts

## Warning: le package 'xts' a 閠� compil� avec la version R 4.1.3

## Le chargement a nécessité le package : zoo

## Warning: le package 'zoo' a 閠� compil� avec la version R 4.1.3

## 
## Attachement du package : 'zoo'

## Les objets suivants sont masqués depuis 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attachement du package : 'xts'

## Les objets suivants sont masqués depuis 'package:dplyr':
## 
##     first, last

## 
## Attachement du package : 'PerformanceAnalytics'

## L'objet suivant est masqué depuis 'package:graphics':
## 
##     legend

library(readxl)
library(dplyr)

# plot distributions and correlations for each pair of variables
chart.Correlation(numeric_data, histogram=TRUE, pch=19)

#as observed in the regression analysis, there is a strong positive correlation between Calories and Total Fats, Calories and saturated fats, total fats and saturated fats, and salt and protein. Negative correlations only involve total sugars (with dietary fiber, protein, and salt), neverthelss is low (highest negative correlation is between salt and sugar, with a correlation coeff. of -0.28)

#Calculating the principal components (data is standardized and normalized)
results.pca = prcomp(numeric_data, scale=TRUE)

#scree plot
fviz_eig(results.pca, addlabels = TRUE, ylim = c(0, 70))

# Print cumulative explained variance
round(get_eigenvalue(results.pca), 4)#[0:5,]

##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1     4.8390          60.4873                     60.4873
## Dim.2     1.6724          20.9052                     81.3925
## Dim.3     0.6647           8.3088                     89.7013
## Dim.4     0.5340           6.6755                     96.3768
## Dim.5     0.1374           1.7169                     98.0938
## Dim.6     0.0866           1.0822                     99.1760
## Dim.7     0.0657           0.8211                     99.9971
## Dim.8     0.0002           0.0029                    100.0000

# Evaluate eigenvalues
results.pca$sdev^2

## [1] 4.838982168 1.672416522 0.664706280 0.534040545 0.137354609 0.086577999
## [7] 0.065690693 0.000231184

# Looking at the eigenvalues, the first two dimensions contain 81.4% of the total variance. This percentage is acceptable to keep the first two main axes for the rest of our analysis.

##### Interpretation Phase ###########################
# Change of the cartesian system (eigenvectors in R point in the negative direction by default)
results.pca$rotation=-results.pca$rotation
results.pca$x=-results.pca$x

# Graph of variables (cos2 = the quality of the variables on the factor map, calculated as the squared coordinates)
fviz_pca_var(results.pca, col.var = "cos2",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE)

#The factor map shows that the vertical axis almost coincides with the amount of sugar contained in the product (products high in sugar are low in the graph). Total Fats almost lies on the horizontal axis, although Calories, a little bit lower than Total Fats with regards to the horizontal axis, have a higher quality. Since the regression has shown that Total Fats are highly correlated with Calories, we can define this axis as Nutritional Value. 

var = get_pca_var(results.pca)

corrplot(var$cos2, is.corr=FALSE)

#the same results as the factor map are displayed 

# Interpretation of the Principal Components: loadings
results.pca$rotation[, 1:2]

##                       PC1         PC2
## Calories       0.44466763 -0.13311501
## Total Fats     0.42254669  0.04568254
## Saturated Fats 0.36245948 -0.10458761
## Total Carbs    0.30316436 -0.49947828
## Total Sugars   0.01836078 -0.75201108
## Dietary Fiber  0.32881795  0.17648702
## Protein        0.39921587  0.20119659
## Salt           0.36415357  0.28745082

#the first principal component is positively correlated with each variable except for Total Sugars, with which the correlation, although very low, is negative. The second principal component is highly & negatively correlated with Total Sugars. The correlation with Calories is significant while weaker than that with Total Sugars (still negative though).


# For a clear visualization of individuals and avoidance of overlapping text, only the thirty main contributors to the main axes have been represented 

fviz_pca_ind (results.pca, axes = c(1, 2), geom = c("point", "text"), select.ind = list(contrib = 30), repel = TRUE, habillage=data_clean_2$Rating)

## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#the vertical axis opposes products that are sweet (at the bottom we can find McFlurries, Sundaes and Fanta) to ones that salty/bitter (At the top we can find coffee, McWrap, Gran Crispy McBacon). We can call this axis "Saltiness/bitterness". 
#the horizontal axis opposes products that are high in calories and fats (My Selection BBQ, My Selection Asiago DOP & Bacon, Gran Crispy McBacon) to products that are low in carbs and fats (coffee, Sprite, Coca Cola). Since the regression has shown that Total Fats are highly correlated with Calories, we can define this axis as "Nutritional Value". 

# Save the first 2 principal components to the McDonalds dataframe
McDonalds_Menu_PC = data_clean_2
McDonalds_Menu_PC = cbind(data_clean_2, results.pca$x[, 1:2])

#Removing outliers identified during regression
McDonalds_Menu_PC <- McDonalds_Menu_PC[-c(15, 76, 109),]

#We can now perform a clustering analysis to describe clusters of products considering the two PCs we have identified.

#Define a seed
set.seed(3)

#to identify the optimal number of clusters, we will take into account Within Cluster Deviances decreasing, Silhouette Coefficients 

# Elbow Methods for k-means Algorithm
standardized_numeric_data  %>%
  fviz_nbclust(FUN = kmeans, method = 'wss')

# Silhouette Methods for k-means Algorithm
standardized_numeric_data %>%
  fviz_nbclust(FUN = kmeans, method = 'silhouette', verbose = TRUE)

#We have decided to extract 4 clusters. Although the silhouette method identifies 3 as the optimal number of clusters, we can see that there is an elbow point at 4. Moreover, the differences in average silhouette width between 3 and 4 clusters are marginal. 

n_clusters = 4

# H. Ward's Method for creating cluster labels
res.hclust = hclust(dist(standardized_numeric_data), method = "ward.D")


# Adding cluster labels to data
clust_label = cutree(res.hclust, k = n_clusters)
McDonalds_Menu_PC = McDonalds_Menu_PC %>% 
  cbind(clust_label=clust_label)

#Plot of the cluster result based on the distribution on the 2 PCs


# Cluster by PC1 and PC2
plot_pc1_pc2 = cutree(res.hclust, k = n_clusters) %>% 
  data.frame() %>% 
  cbind(McDonalds_Menu_PC) %>%  
  ggplot() + 
  geom_point(aes(x = PC1,
                 y = PC2,
                 color = as.factor(.)
  ),
  show.legend = T) +
  geom_text(aes(x = PC1,
                y = PC2,
                color = as.factor(.)), 
            label=McDonalds_Menu_PC$name,
            nudge_x=0.45, nudge_y=0.2,
            check_overlap=T) +
  labs(title = "Clusters by PC1 and PC2") +
  geom_hline(yintercept = 0, colour="#999999") +
  geom_vline(xintercept = 0, colour="#999999")


#plot
plot_pc1_pc2

#Red cluster: "Sugary Sweets"
#Green Cluster: "Fairly Healthy Snacks and Burgers"
#Blue cluster:  "Low Cal"
#Purple cluster: "Unhealty Burgers"

# We now will apply the Hierarchical classification directly to the PCA result.
McDonalds_PC_Numeric = McDonalds_Menu_PC %>% dplyr::select(where(is.numeric))

McDonalds_PC_Numeric_std <- McDonalds_PC_Numeric

McDonalds_PC_Numeric_std$Calories_std = scale(McDonalds_PC_Numeric$Calories)
McDonalds_PC_Numeric_std$Total_Fats_std = scale(McDonalds_PC_Numeric$`Total Fats`)
McDonalds_PC_Numeric_std$Saturated_Fats_std = scale(McDonalds_PC_Numeric$`Saturated Fats`)
McDonalds_PC_Numeric_std$Total_Carbs_std = scale(McDonalds_PC_Numeric$`Total Carbs`)
McDonalds_PC_Numeric_std$Total_Sugars_std = scale(McDonalds_PC_Numeric$`Total Sugars`)
McDonalds_PC_Numeric_std$Dietary_Fiber_std = scale(McDonalds_PC_Numeric$`Dietary Fiber`)
McDonalds_PC_Numeric_std$Protein_std = scale(McDonalds_PC_Numeric$`Protein`)
McDonalds_PC_Numeric_std$Salt_std = scale(McDonalds_PC_Numeric$`Salt`)
McDonalds_PC_Numeric_std$PC1 = scale(McDonalds_PC_Numeric$`PC1`)
McDonalds_PC_Numeric_std$PC2 = scale(McDonalds_PC_Numeric$`PC2`)

# Remove unstandardized values
McDonalds_PC_Numeric_std <- McDonalds_PC_Numeric_std[ -c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14)]

# Check if means are equal to 0
summarise(McDonalds_PC_Numeric_std
          , PC1 = mean(PC1)
          , PC2 = mean(PC2)
)

##            PC1           PC2
## 1 6.137963e-18 -1.058858e-17

# Check if standard deviations are equal to 1
summarise(McDonalds_PC_Numeric_std
          , PC1 = sd(PC1)
          , PC2 = sd(PC2)
)

##   PC1 PC2
## 1   1   1

# Standard Deviation are equal to the square root of the eigenvalues!
summarise(McDonalds_Menu_PC
          , PC1 = var(PC1)
          , PC2 = var(PC2)
)

##        PC1      PC2
## 1 3.701332 1.689065

# Choose the number of clusters
#We first create a new DF with the two principal components only
McDonalds_PC_Numeric_std_PC = McDonalds_PC_Numeric_std %>% select(PC1, PC2)

# Elbow Methods for k-means Algorithm
McDonalds_PC_Numeric_std_PC %>%
  fviz_nbclust(FUN = kmeans, method = 'wss')

# Silhouette Methods for k-means Algorithm
McDonalds_PC_Numeric_std_PC %>%
  fviz_nbclust(FUN = kmeans, method = 'silhouette', verbose = TRUE)

# After evaluating Within Cluster Deviances decreasing, and Silhouette Coefficients we have decided to extract 3 clusters
n_clusters_PCA = 3

# H. Ward's Method for creating cluster labels
res.hclust_PC = hclust(dist(McDonalds_PC_Numeric_std_PC), method = "ward.D")



# Cluster by PC1 and PC2
plot_pc1_pc2_pc = cutree(res.hclust_PC, k = n_clusters_PCA) %>% 
  data.frame() %>% 
  cbind(McDonalds_Menu_PC) %>% 
  ggplot() + 
  geom_point(aes(x = PC1,
                 y = PC2,
                 color = as.factor(.)
  ),
  show.legend = T) +
  geom_text(aes(x = PC1,
                y = PC2,
                color = as.factor(.)), 
            label=McDonalds_Menu_PC$name,
            nudge_x=0.45, nudge_y=0.2,
            check_overlap=T) +
  labs(title = "Clusters (based on PC) by PC1 and PC2 ") +
  geom_hline(yintercept = 0, colour="#999999") +
  geom_vline(xintercept = 0, colour="#999999")


#plot
plot_pc1_pc2_pc

#Red cluster: "Sugary Sweets"
#Green cluster: "Salty/Low Sugar Food"
#Blue cluster: "Low Cal Food and condiments"

#As the number of clusters is only reduced from 4 to 3, and and as this has happened pretty much by uniting clusters 2&4 from the first hierarchical classification into one single cluster (cluster 2 from the second hierarchical clustering), we can conclude that the loss of information associated with reducing the variables to 2 PCs is not significant.