# NOTE(review): setwd() pins this script to one user's directory layout; the
# relative path passed to read.csv() further down is resolved against it.
setwd("~/r_learning")
library(readr)
library(forcats)
library(dplyr)
library(corrplot)
library(ggplot2)
library(tidyverse)
library(knitr)
# Read Mcdonald.csv into a data frame, then show its shape and column names.
mac <- read.csv(file = "Mcdonald.csv")
dim(mac)
## [1] 260 24
names(mac)
## [1] "Category" "Item"
## [3] "Serving.Size" "Calories"
## [5] "Calories.from.Fat" "Total.Fat"
## [7] "Total.Fat....Daily.Value." "Saturated.Fat"
## [9] "Saturated.Fat....Daily.Value." "Trans.Fat"
## [11] "Cholesterol" "Cholesterol....Daily.Value."
## [13] "Sodium" "Sodium....Daily.Value."
## [15] "Carbohydrates" "Carbohydrates....Daily.Value."
## [17] "Dietary.Fiber" "Dietary.Fiber....Daily.Value."
## [19] "Sugars" "Protein"
## [21] "Vitamin.A....Daily.Value." "Vitamin.C....Daily.Value."
## [23] "Calcium....Daily.Value." "Iron....Daily.Value."
# Number of menu items in each Category, largest category first.
mac_cat <- mac %>%
  group_by(Category) %>%
  summarise(No_items = n()) %>%
  arrange(desc(No_items))
# Food Categories arranged from highest to lowest
print(mac_cat)
## # A tibble: 9 x 2
## Category No_items
## <fct> <int>
## 1 Coffee & Tea 95
## 2 Breakfast 42
## 3 Smoothies & Shakes 28
## 4 Beverages 27
## 5 Chicken & Fish 27
## 6 Beef & Pork 15
## 7 Snacks & Sides 13
## 8 Desserts 7
## 9 Salads 6
| Variety | Type | Count |
|---|---|---|
| Coffee & Tea | Highest | 95 |
| Salads | Lowest | 6 |
Coffee & Tea is marked with a green bar; Salads is marked with a red bar.
# Plotting of Food Categories
# Horizontal bar chart of the item count per category, ordered by count.
# The categories with the fewest and the most items are redrawn on top of the
# base bars in red and green respectively.
ggplot(mac_cat) +
  # geom_col() is the direct form of geom_bar(stat = "identity")
  geom_col(aes(x = fct_reorder(Category, No_items), y = No_items)) +
  # Category having lowest variety of items marked in red bar
  geom_col(
    data = subset(mac_cat, No_items == min(No_items)),
    aes(x = Category, y = No_items),
    fill = "red"
  ) +
  # Category having highest variety of items marked in green bar
  geom_col(
    data = subset(mac_cat, No_items == max(No_items)),
    aes(x = Category, y = No_items),
    fill = "green"
  ) +
  labs(
    title = " Variety in MacD Food Categories ",
    x = "Food Categories",
    y = "Count of item Varieties"
  ) +
  coord_flip() +
  theme_light()
Since there is a large number of variables (24), outlier analysis cannot be done in one go. The variables have therefore been split into two parts for ease of analysis: the first part holds the plain elements that do not have “% Daily Value” in their names, and the second part holds the variables that do.
# Drop the "Daily.Value" columns and stack the remaining measurement columns
# into Elements/value pairs; Category, Item and Serving.Size stay as
# identifier columns.
by_Ingredients <- mac %>%
  select(-contains("Daily.Value")) %>%
  gather(key = Elements, value = "value", -c(Category, Item, Serving.Size))
colnames(by_Ingredients)
## [1] "Category" "Item" "Serving.Size" "Elements"
## [5] "value"
# Store the element names as a factor and count the distinct elements.
by_Ingredients <- by_Ingredients %>%
  mutate(Elements = as.factor(Elements))
nlevels(by_Ingredients$Elements)
## [1] 11
# Boxplot of every element's values on a log10 scale, one box per element,
# with outliers drawn in dark blue.
# log10(0) is -Inf, so zero values are non-finite and get dropped by
# stat_boxplot (see the warning printed after the plot).
ggplot(by_Ingredients, aes(x = Elements, y = log10(value), fill = Elements)) +
  geom_boxplot(
    alpha = 0.3,
    outlier.colour = "darkblue",
    outlier.fill = "darkblue",
    outlier.size = 3
  ) +
  coord_flip() +
  labs(
    # The y aesthetic is log10 of the raw element value, not a % daily value.
    y = "log10(value)",
    x = "Elements",
    title = " Outlier analysis of Elemental variables",
    caption = "Outliers marked in blue"
  ) +
  theme_minimal()
## Warning: Removed 573 rows containing non-finite values (stat_boxplot).
Log10 values were used to show the spread of the values more clearly. Trans Fat, Sugars and Dietary Fiber have no outliers in the first part, as is clear from the graph above.
# Keep the "Daily.Value" columns plus the three identifier columns.
by_DailyValue <- select(
  mac,
  contains("Daily.Value"), Category, Item, Serving.Size
)
names(by_DailyValue)
## [1] "Total.Fat....Daily.Value." "Saturated.Fat....Daily.Value."
## [3] "Cholesterol....Daily.Value." "Sodium....Daily.Value."
## [5] "Carbohydrates....Daily.Value." "Dietary.Fiber....Daily.Value."
## [7] "Vitamin.A....Daily.Value." "Vitamin.C....Daily.Value."
## [9] "Calcium....Daily.Value." "Iron....Daily.Value."
## [11] "Category" "Item"
## [13] "Serving.Size"
# Stack the daily-value columns into Ing_DailyValue/value pairs, keeping
# Category, Item and Serving.Size as identifier columns, and shorten the
# "....Daily.Value." suffix of each variable name to "DV".
by_DailyValue <- by_DailyValue %>%
  gather(
    key = Ing_DailyValue, value = "value",
    -c(Category, Item, Serving.Size)
  ) %>%
  # fixed() matches the suffix literally. As a regex every "." matches any
  # character, and the pattern without the final "." left a trailing dot on
  # each label.
  mutate(
    Ing_DailyValue = str_replace(
      Ing_DailyValue, fixed("....Daily.Value."), "DV"
    )
  )
# Boxplot of the % daily values per variable, with the median as a red dot and
# outliers in dark blue.
ggplot(
  by_DailyValue,
  aes(x = Ing_DailyValue, y = value, fill = Ing_DailyValue)
) +
  geom_boxplot(
    alpha = 0.3,
    outlier.colour = "darkblue",
    outlier.fill = "darkblue",
    outlier.size = 3
  ) +
  coord_flip() +
  # `fun` is the current name of the deprecated `fun.y` argument.
  stat_summary(
    fun = median, geom = "point", shape = 20, size = 3,
    color = "red", fill = "red"
  ) +
  labs(
    y = "Percent Daily Value",
    x = "Elemental Ingredients",
    title = " Outlier Analysis of Daily Value Variables",
    caption = "Median %age value marked in Red Dot\nOutliers marked in dark blue"
  )
Saturated Fat Daily Value had no outliers in the second part.
So, in all, 4 variables had no outliers in the Mac food item categories.
They are:
1. Saturated Fat Daily Value
2. Sugars
3. Dietary Fiber
4. Trans Fat
# Correlation matrix of all numeric columns, computed once and drawn twice.
mac_cor <- mac %>%
  select_if(is.numeric) %>%
  cor()
# 1. Graphical view: full matrix as coloured cells.
#    tl.pos = "lt" (labels left and top) is the documented value for a full
#    matrix. corrplot's default margins are all zero, so `mar` reserves room
#    at the top for the title.
corrplot(
  mac_cor,
  title = "Correlation of Mac Food",
  tl.pos = "lt", tl.cex = 0.7, tl.col = "black",
  method = "color", type = "full",
  addgrid.col = "black",
  mar = c(0, 0, 2, 0)
)
# 2. Numerical view: upper triangle with the coefficients printed.
#    mac1 keeps the value returned by this last corrplot() call.
mac1 <- corrplot(
  mac_cor,
  title = "Correlation of Mac Food",
  tl.pos = "td", tl.cex = 0.7, tl.col = "black",
  method = "number", type = "upper",
  addgrid.col = "black", addCoef.col = "black",
  number.cex = 1, number.font = 2, number.digits = 1,
  mar = c(0, 0, 2, 0)
)
Two representations of the correlation plot were created:
1. Graphical
2. Numerical
Calories was highly positively correlated with:
Calories from Fat
Total Fat
Saturated Fat
Sodium
Carbohydrates
Protein
The plot also highlights that Sodium <–> Protein <–> Iron are highly positively correlated.
Carbohydrates and Sugars also show a good positive correlation.
There is a weak negative correlation of about -0.4 between Sodium, Sugars and Iron.
# Largest cholesterol % daily value found in each Category, highest first.
mac_Chol <- mac %>%
  group_by(Category) %>%
  summarise(Chol.Level = max(Cholesterol....Daily.Value.)) %>%
  arrange(desc(Chol.Level))
print(mac_Chol)
## # A tibble: 9 x 2
## Category Chol.Level
## <fct> <int>
## 1 Breakfast 192
## 2 Chicken & Fish 89
## 3 Beef & Pork 53
## 4 Coffee & Tea 32
## 5 Smoothies & Shakes 30
## 6 Salads 29
## 7 Snacks & Sides 16
## 8 Desserts 10
## 9 Beverages 3
Breakfast is the category with the highest Cholesterol (% Daily Value) among the Mac D food items.
# Items ordered by Sodium, highest first; print the single top row.
by_sodium <- mac %>%
  arrange(desc(Sodium)) %>%
  select(Category, Item, Sodium)
print(head(by_sodium, n = 1))
## Category Item Sodium
## 1 Chicken & Fish Chicken McNuggets (40 piece) 3600
Chicken McNuggets (40 piece) has the highest Sodium content, 3600, marked by a red dot in the plot.
# Sodium of every item, grouped by Category; the item(s) holding the maximum
# Sodium value are drawn in red, all others in blue.
ggplot(by_sodium, aes(x = Category, y = Sodium)) +
  # The comparison already yields TRUE/FALSE, so no ifelse(..., T, F) wrapper
  # is needed.
  geom_point(aes(color = Sodium == max(Sodium)), size = 4, shape = 20) +
  # Named values tie each colour to its logical level instead of relying on
  # the order of the levels.
  scale_color_manual(values = c("FALSE" = "blue", "TRUE" = "red")) +
  labs(
    color = " Max Sodium",
    y = "Sodium Content",
    title = "Sodium Contamination - Mac D Foods"
  ) +
  coord_flip() +
  theme_gray()
# Items with the four highest Saturated.Fat values, highest first.
# top_n() keeps ties, so more than four rows can come back.
mac_fat <- mac %>%
  select(Item, Saturated.Fat) %>%
  top_n(4, Saturated.Fat) %>%
  arrange(desc(Saturated.Fat))
print(mac_fat)
## Item Saturated.Fat
## 1 Big Breakfast with Hotcakes (Large Biscuit) 20
## 2 Chicken McNuggets (40 piece) 20
## 3 Frappé Chocolate Chip (Large) 20
## 4 McFlurry with M&Mâ\200\231s Candies (Medium) 20
The top four food items contributing the highest amounts of Saturated Fat are:
1. Big Breakfast with Hotcakes (Large Biscuit)
2. Chicken McNuggets (40 piece)
3. Frappé Chocolate Chip (Large)
4. McFlurry with M&M’s Candies (Medium)