Preparations

# Directory that holds Mcdonald.csv; the relative path below depends on it.
# NOTE(review): setwd() ties the script to one machine - consider an
# RStudio project or an explicit file path instead.
setwd("~/r_learning")

library(readr)
library(forcats)
library(dplyr)
library(corrplot)
library(ggplot2)
library(tidyverse)
library(knitr)
# Some item names contain non-ASCII characters (an accented "e" and a
# typographic apostrophe). Marking the input as UTF-8 keeps them from being
# printed as mojibake such as "FrappÃ©".
mac <- read.csv("Mcdonald.csv", encoding = "UTF-8")

Dimension of data

# Number of rows and columns in the data set
dim(mac)
## [1] 260  24
# Names of all columns in the data set
colnames(mac)
##  [1] "Category"                      "Item"                         
##  [3] "Serving.Size"                  "Calories"                     
##  [5] "Calories.from.Fat"             "Total.Fat"                    
##  [7] "Total.Fat....Daily.Value."     "Saturated.Fat"                
##  [9] "Saturated.Fat....Daily.Value." "Trans.Fat"                    
## [11] "Cholesterol"                   "Cholesterol....Daily.Value."  
## [13] "Sodium"                        "Sodium....Daily.Value."       
## [15] "Carbohydrates"                 "Carbohydrates....Daily.Value."
## [17] "Dietary.Fiber"                 "Dietary.Fiber....Daily.Value."
## [19] "Sugars"                        "Protein"                      
## [21] "Vitamin.A....Daily.Value."     "Vitamin.C....Daily.Value."    
## [23] "Calcium....Daily.Value."       "Iron....Daily.Value."

Plotting food categories having highest and lowest varieties

# Count the menu items in each food category, largest category first
mac_cat <- mac %>%
  group_by(Category) %>%
  summarise(No_items = n()) %>%
  arrange(desc(No_items))
# Food Categories arranged from highest to lowest
print(mac_cat)
## # A tibble: 9 x 2
##   Category           No_items
##   <fct>                 <int>
## 1 Coffee & Tea             95
## 2 Breakfast                42
## 3 Smoothies & Shakes       28
## 4 Beverages                27
## 5 Chicken & Fish           27
## 6 Beef & Pork              15
## 7 Snacks & Sides           13
## 8 Desserts                  7
## 9 Salads                    6
Variety Type Count
Coffee & Tea Highest 95
Salads Lowest 6

Coffee & Tea is marked with a green bar; Salads is marked with a red bar.

# Plotting of Food Categories
# The categories are ordered by item count once, up front, so that the base
# bars and both highlight layers share exactly the same x variable.
mac_cat %>%
  mutate(Category = fct_reorder(Category, No_items)) %>%
  ggplot(aes(x = Category, y = No_items)) +
  geom_col() +
  # Category having lowest variety of items marked in red bar
  geom_col(data = function(d) subset(d, No_items == min(No_items)),
           fill = "red") +
  # Category having highest variety of items marked in green bar
  geom_col(data = function(d) subset(d, No_items == max(No_items)),
           fill = "green") +
  labs(title = " Variety in MacD Food Categories ", x = "Food Categories",
       y = "Count of item Varieties") +
  # Horizontal bars keep the long category names readable
  coord_flip() +
  theme_light()

Outlier analysis - Boxplot method

Since there is a large number of variables (24), the outlier analysis cannot be done in one go. For ease of analysis the variables have been split into two groups: those that do not have “% Daily Value” in their names, and those that do.

For Variables not having “Daily value” in their Names

# Drop every column whose name contains "Daily.Value", then stack the
# remaining measurement columns into long form (one row per item and element)
by_Ingredients <- mac %>%
  select(-contains("Daily.Value")) %>%
  gather(key = "Elements", value = "value",
         -Category, -Item, -Serving.Size)
colnames(by_Ingredients)
## [1] "Category"     "Item"         "Serving.Size" "Elements"    
## [5] "value"
# Treat the element name as a categorical variable and count its levels
by_Ingredients <- by_Ingredients %>%
  mutate(Elements = as.factor(Elements))
nlevels(by_Ingredients$Elements)
## [1] 11
# Boxplot of every elemental variable on a log10 scale.
# log10(0) is -Inf, so zero-valued rows are dropped by stat_boxplot with a
# "non-finite values" warning.
ggplot(by_Ingredients, aes(x = Elements, y = log10(value), fill = Elements)) +
  geom_boxplot(alpha = 0.3, outlier.colour = "darkblue",
               outlier.fill = "darkblue",
               outlier.size = 3) +
  coord_flip() +
  # These variables are raw amounts, not percent daily values, and the axis
  # shows their log10 transform - label it accordingly.
  labs(y = "log10(value)", x = "Elements",
       title = " Outlier analysis of Elemental variables",
       caption = "Outliers marked in blue") +
  theme_minimal()
## Warning: Removed 573 rows containing non-finite values (stat_boxplot).

log10 values were used to show the spread of the variables more clearly. As the graph above shows, Trans Fat, Sugars and Dietary Fiber have no outliers in this first group.

For Variables containing % Daily Values as Names

# Keep the "Daily.Value" columns together with the three identifying columns
by_DailyValue <- select(
  mac,
  contains("Daily.Value"), Category, Item, Serving.Size
)
colnames(by_DailyValue)
##  [1] "Total.Fat....Daily.Value."     "Saturated.Fat....Daily.Value."
##  [3] "Cholesterol....Daily.Value."   "Sodium....Daily.Value."       
##  [5] "Carbohydrates....Daily.Value." "Dietary.Fiber....Daily.Value."
##  [7] "Vitamin.A....Daily.Value."     "Vitamin.C....Daily.Value."    
##  [9] "Calcium....Daily.Value."       "Iron....Daily.Value."         
## [11] "Category"                      "Item"                         
## [13] "Serving.Size"
# Stack the daily-value columns into long form and shorten their names,
# e.g. "Total.Fat....Daily.Value." becomes "Total.FatDV".
by_DailyValue <- by_DailyValue %>%
  gather(key = Ing_DailyValue, value = "value",
         -c(Category, Item, Serving.Size)) %>%
  # fixed() matches the suffix literally. As a regular expression each "."
  # would match any character, and the trailing "." of the column name was
  # previously left behind in the label.
  mutate(Ing_DailyValue = str_replace(Ing_DailyValue,
                                      fixed("....Daily.Value."), "DV"))
# Boxplot of every percent-daily-value variable, with the median of each
# variable overlaid as a red dot
ggplot(
  by_DailyValue,
  aes(x = Ing_DailyValue, y = value, fill = Ing_DailyValue)
) +
  geom_boxplot(
    alpha = 0.3,
    outlier.colour = "darkblue",
    outlier.fill = "darkblue",
    outlier.size = 3
  ) +
  stat_summary(
    fun.y = median, geom = "point",
    shape = 20, size = 3, color = "red", fill = "red"
  ) +
  coord_flip() +
  labs(
    title = " Outlier Analysis of Daily Value Variables",
    x = "Elemental Ingredients",
    y = "Percent Daily Value",
    caption = "Median %age value marked in Red Dot
                      Outliers marked in dark blue"
  )

Saturated Fat Daily value had no outlier from the second part

So in all 4 variables had no Outliers in the Mac food item Categories

They are

 1. Saturated Fat Daily Value 
 
 2. Sugars
 
 3. Dietary Fiber
 
 4. Trans Fat

Correlation plots

# Graphical view: full correlation matrix drawn as coloured cells
mac1 <- mac %>%
  select_if(is.numeric) %>%
  cor() %>%
  corrplot(
    title = "Correlation of Mac Food",
    method = "color", type = "full",
    tl.pos = "dt", tl.cex = 0.7, tl.col = "black",
    addgrid.col = "black"
  )

# Numerical view: upper triangle with the coefficients printed in each cell
mac1 <- mac %>%
  select_if(is.numeric) %>%
  cor() %>%
  corrplot(
    title = "Correlation of Mac Food",
    method = "number", type = "upper",
    tl.pos = "td", tl.cex = 0.7, tl.col = "black",
    addgrid.col = "black", addCoef.col = "black",
    number.cex = 1, number.font = 2, number.digits = 1
  )

Two representations of Correlation plots were created

  1. Graphical

  2. Numerical 

Calories was highly correlated positively to

  1. Calories from Fat

  2. Total Fat

  3. Saturated Fat

  4. Sodium

  5. Carbohydrates

  6. Protein

The plot also shows that Sodium, Protein and Iron are highly positively correlated with one another

Even Carbohydrates and Sugars show good bonding positively

There is a weak negative correlation of approximately -0.4 between Sodium, Sugars and Iron

Food Category having maximum Cholesterol % Daily Value

# Highest cholesterol (% daily value) found in each category, largest first
mac_Chol <- mac %>%
  group_by(Category) %>%
  summarise(Chol.Level = max(Cholesterol....Daily.Value.)) %>%
  arrange(desc(Chol.Level))
print(mac_Chol)
## # A tibble: 9 x 2
##   Category           Chol.Level
##   <fct>                   <int>
## 1 Breakfast                 192
## 2 Chicken & Fish             89
## 3 Beef & Pork                53
## 4 Coffee & Tea               32
## 5 Smoothies & Shakes         30
## 6 Salads                     29
## 7 Snacks & Sides             16
## 8 Desserts                   10
## 9 Beverages                   3

Breakfast is the category with the highest Cholesterol (% Daily Value) among the Mac D food items

Food item having maximum Sodium

# Items ranked by sodium content, highest first; show the top item
by_sodium <- mac %>%
  select(Category, Item, Sodium) %>%
  arrange(desc(Sodium))
print(head(by_sodium, n = 1))
##         Category                         Item Sodium
## 1 Chicken & Fish Chicken McNuggets (40 piece)   3600

Chicken McNuggets have the highest Sodium content of 3600 marked by Red dot in the plot

# Sodium content of every item by category; the item(s) with the maximum
# sodium value are drawn in red, all others in blue.
ggplot(by_sodium, aes(x = Category, y = Sodium)) +
  # The comparison already yields TRUE/FALSE, so no ifelse() is needed
  geom_point(aes(color = Sodium == max(Sodium)), size = 4, shape = 20) +
  # Named values pin each colour to its level instead of relying on the
  # order in which the levels happen to appear
  scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "red")) +
  labs(color = " Max Sodium", y = "Sodium Content",
       title = "Sodium Contamination - Mac D Foods") +
  coord_flip() +
  theme_gray()

Top 4 Food items having maximum Saturated Fat

# Items with the four highest saturated-fat values (top_n keeps ties)
mac_fat <- mac %>%
  arrange(desc(Saturated.Fat)) %>%
  select(Item, Saturated.Fat) %>%
  top_n(n = 4, wt = Saturated.Fat)
print(mac_fat)
##                                          Item Saturated.Fat
## 1 Big Breakfast with Hotcakes (Large Biscuit)            20
## 2                Chicken McNuggets (40 piece)            20
## 3              Frappé Chocolate Chip (Large)            20
## 4      McFlurry with M&Mâ\200\231s Candies (Medium)            20

The top four food items contributing the highest amounts of Saturated Fat are

 1. Big Breakfast with Hotcakes (Large Biscuit)     

 2. Chicken McNuggets (40 piece)

 3. Frappé Chocolate Chip (Large)      

 4. McFlurry with M&M’s Candies (Medium)