Load packages

library(tidyverse)
library(readr)
library(dplyr)
library(janitor)
library(plotly)
library(forcats)
library(plyr)
library(scales)

Ask

The purpose of my analysis is to find out how healthy the listed cereals are.

    1. Which cereal is the healthiest, and why?
    2. How does the healthiest cereal match up to the recommended daily intakes provided by the World Health Organization (WHO)?

I’m the primary stakeholder for this project.

Prepare

This dataset was provided by Chris Crawford on Kaggle. Latest update was 10-24-2017. Data was downloaded as a .CSV and stored safely on an external hard drive.

References

# Read the cereal nutrition table and the high-potassium food reference table
# from CSV files located relative to the working directory.
cereal <- read.csv(file = "cereal.csv")
food_group <- read.csv(file = "high_potassium_food.csv")

Process

Used glimpse to confirm that we have 77 rows and 16 columns and to check the data types involved, n_distinct to make sure there are no duplicate rows, and summary to check the range of each column.

# Show the row/column counts, each column's type, and its first few values.
glimpse(cereal)
## Rows: 77
## Columns: 16
## $ name     <chr> "100% Bran", "100% Natural Bran", "All-Bran", "All-Bran with ~
## $ mfr      <chr> "N", "Q", "K", "K", "R", "G", "K", "G", "R", "P", "Q", "G", "~
## $ type     <chr> "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "~
## $ calories <int> 70, 120, 70, 50, 110, 110, 110, 130, 90, 90, 120, 110, 120, 1~
## $ protein  <int> 4, 3, 4, 4, 2, 2, 2, 3, 2, 3, 1, 6, 1, 3, 1, 2, 2, 1, 1, 3, 3~
## $ fat      <int> 1, 5, 1, 0, 2, 2, 0, 2, 1, 0, 2, 2, 3, 2, 1, 0, 0, 0, 1, 3, 0~
## $ sodium   <int> 130, 15, 260, 140, 200, 180, 125, 210, 200, 210, 220, 290, 21~
## $ fiber    <dbl> 10.0, 2.0, 9.0, 14.0, 1.0, 1.5, 1.0, 2.0, 4.0, 5.0, 0.0, 2.0,~
## $ carbo    <dbl> 5.0, 8.0, 7.0, 8.0, 14.0, 10.5, 11.0, 18.0, 15.0, 13.0, 12.0,~
## $ sugars   <int> 6, 8, 5, 0, 8, 10, 14, 8, 6, 5, 12, 1, 9, 7, 13, 3, 2, 12, 13~
## $ potass   <int> 280, 135, 320, 330, -1, 70, 30, 100, 125, 190, 35, 105, 45, 1~
## $ vitamins <int> 25, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25~
## $ shelf    <int> 3, 3, 3, 3, 3, 1, 2, 3, 1, 3, 2, 1, 2, 3, 2, 1, 1, 2, 2, 3, 2~
## $ weight   <dbl> 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.33, 1.00, 1.00, 1~
## $ cups     <dbl> 0.33, 1.00, 0.33, 0.50, 0.75, 0.75, 1.00, 0.75, 0.67, 0.67, 0~
## $ rating   <dbl> 68.40297, 33.98368, 59.42551, 93.70491, 34.38484, 29.50954, 3~
# Count distinct rows; a result equal to the row count means no row is duplicated.
n_distinct(cereal)
## [1] 77
# Per-column summary statistics, used here to spot out-of-range values.
summary(cereal)
##      name               mfr                type              calories    
##  Length:77          Length:77          Length:77          Min.   : 50.0  
##  Class :character   Class :character   Class :character   1st Qu.:100.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :110.0  
##                                                           Mean   :106.9  
##                                                           3rd Qu.:110.0  
##                                                           Max.   :160.0  
##     protein           fat            sodium          fiber       
##  Min.   :1.000   Min.   :0.000   Min.   :  0.0   Min.   : 0.000  
##  1st Qu.:2.000   1st Qu.:0.000   1st Qu.:130.0   1st Qu.: 1.000  
##  Median :3.000   Median :1.000   Median :180.0   Median : 2.000  
##  Mean   :2.545   Mean   :1.013   Mean   :159.7   Mean   : 2.152  
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:210.0   3rd Qu.: 3.000  
##  Max.   :6.000   Max.   :5.000   Max.   :320.0   Max.   :14.000  
##      carbo          sugars           potass          vitamins     
##  Min.   :-1.0   Min.   :-1.000   Min.   : -1.00   Min.   :  0.00  
##  1st Qu.:12.0   1st Qu.: 3.000   1st Qu.: 40.00   1st Qu.: 25.00  
##  Median :14.0   Median : 7.000   Median : 90.00   Median : 25.00  
##  Mean   :14.6   Mean   : 6.922   Mean   : 96.08   Mean   : 28.25  
##  3rd Qu.:17.0   3rd Qu.:11.000   3rd Qu.:120.00   3rd Qu.: 25.00  
##  Max.   :23.0   Max.   :15.000   Max.   :330.00   Max.   :100.00  
##      shelf           weight          cups           rating     
##  Min.   :1.000   Min.   :0.50   Min.   :0.250   Min.   :18.04  
##  1st Qu.:1.000   1st Qu.:1.00   1st Qu.:0.670   1st Qu.:33.17  
##  Median :2.000   Median :1.00   Median :0.750   Median :40.40  
##  Mean   :2.208   Mean   :1.03   Mean   :0.821   Mean   :42.67  
##  3rd Qu.:3.000   3rd Qu.:1.00   3rd Qu.:1.000   3rd Qu.:50.83  
##  Max.   :3.000   Max.   :1.50   Max.   :1.500   Max.   :93.70

Removed the rows that contain negative values (the summary above shows a minimum of -1 for carbo, sugars, and potass, which cannot be a real nutrient amount). This left 74 rows and 16 columns.

# Drop rows whose nutrient amounts are negative. summary(cereal) reports a
# minimum of -1 for carbo, sugars, and potass, so all three columns are
# checked explicitly instead of relying on the bad sugars value happening to
# share a row with a bad carbo value.
cereal_without_neg <- subset(cereal, carbo >= 0 & sugars >= 0 & potass >= 0)
glimpse(cereal_without_neg)
## Rows: 74
## Columns: 16
## $ name     <chr> "100% Bran", "100% Natural Bran", "All-Bran", "All-Bran with ~
## $ mfr      <chr> "N", "Q", "K", "K", "G", "K", "G", "R", "P", "Q", "G", "G", "~
## $ type     <chr> "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "C", "~
## $ calories <int> 70, 120, 70, 50, 110, 110, 130, 90, 90, 120, 110, 120, 110, 1~
## $ protein  <int> 4, 3, 4, 4, 2, 2, 3, 2, 3, 1, 6, 1, 3, 1, 2, 2, 1, 1, 3, 2, 2~
## $ fat      <int> 1, 5, 1, 0, 2, 0, 2, 1, 0, 2, 2, 3, 2, 1, 0, 0, 0, 1, 3, 0, 1~
## $ sodium   <int> 130, 15, 260, 140, 180, 125, 210, 200, 210, 220, 290, 210, 14~
## $ fiber    <dbl> 10.0, 2.0, 9.0, 14.0, 1.5, 1.0, 2.0, 4.0, 5.0, 0.0, 2.0, 0.0,~
## $ carbo    <dbl> 5.0, 8.0, 7.0, 8.0, 10.5, 11.0, 18.0, 15.0, 13.0, 12.0, 17.0,~
## $ sugars   <int> 6, 8, 5, 0, 10, 14, 8, 6, 5, 12, 1, 9, 7, 13, 3, 2, 12, 13, 7~
## $ potass   <int> 280, 135, 320, 330, 70, 30, 100, 125, 190, 35, 105, 45, 105, ~
## $ vitamins <int> 25, 0, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25~
## $ shelf    <int> 3, 3, 3, 3, 1, 2, 3, 1, 3, 2, 1, 2, 3, 2, 1, 1, 2, 2, 3, 3, 3~
## $ weight   <dbl> 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.33, 1.00, 1.00, 1.00, 1~
## $ cups     <dbl> 0.33, 1.00, 0.33, 0.50, 0.75, 1.00, 0.75, 0.67, 0.67, 0.75, 1~
## $ rating   <dbl> 68.40297, 33.98368, 59.42551, 93.70491, 29.50954, 33.17409, 3~
# Re-run the summary to confirm that no column has a negative minimum any more.
summary(cereal_without_neg)
##      name               mfr                type              calories  
##  Length:74          Length:74          Length:74          Min.   : 50  
##  Class :character   Class :character   Class :character   1st Qu.:100  
##  Mode  :character   Mode  :character   Mode  :character   Median :110  
##                                                           Mean   :107  
##                                                           3rd Qu.:110  
##                                                           Max.   :160  
##     protein           fat        sodium          fiber            carbo      
##  Min.   :1.000   Min.   :0   Min.   :  0.0   Min.   : 0.000   Min.   : 5.00  
##  1st Qu.:2.000   1st Qu.:0   1st Qu.:135.0   1st Qu.: 0.250   1st Qu.:12.00  
##  Median :2.500   Median :1   Median :180.0   Median : 2.000   Median :14.50  
##  Mean   :2.514   Mean   :1   Mean   :162.4   Mean   : 2.176   Mean   :14.73  
##  3rd Qu.:3.000   3rd Qu.:1   3rd Qu.:217.5   3rd Qu.: 3.000   3rd Qu.:17.00  
##  Max.   :6.000   Max.   :5   Max.   :320.0   Max.   :14.000   Max.   :23.00  
##      sugars           potass          vitamins          shelf      
##  Min.   : 0.000   Min.   : 15.00   Min.   :  0.00   Min.   :1.000  
##  1st Qu.: 3.000   1st Qu.: 41.25   1st Qu.: 25.00   1st Qu.:1.250  
##  Median : 7.000   Median : 90.00   Median : 25.00   Median :2.000  
##  Mean   : 7.108   Mean   : 98.51   Mean   : 29.05   Mean   :2.216  
##  3rd Qu.:11.000   3rd Qu.:120.00   3rd Qu.: 25.00   3rd Qu.:3.000  
##  Max.   :15.000   Max.   :330.00   Max.   :100.00   Max.   :3.000  
##      weight           cups            rating     
##  Min.   :0.500   Min.   :0.2500   Min.   :18.04  
##  1st Qu.:1.000   1st Qu.:0.6700   1st Qu.:32.45  
##  Median :1.000   Median :0.7500   Median :40.25  
##  Mean   :1.031   Mean   :0.8216   Mean   :42.37  
##  3rd Qu.:1.000   3rd Qu.:1.0000   3rd Qu.:50.52  
##  Max.   :1.500   Max.   :1.5000   Max.   :93.70

Arranged the cereals by rating in descending order and cleaned the column names of the food group table.

# Order the cleaned cereals from the highest rating to the lowest.
cereal_by_rating <- arrange(cereal_without_neg, desc(rating))
# Clean up the column names of the high-potassium food table.
food_group <- food_group %>% clean_names()

Analyze and Visualize

As seen below, higher-rated cereals tend to have lower calorie counts.

# Jittered scatter plot of rating against calories for the cleaned cereals.
# NOTE(review): `fill` has no visible effect on the default point shape;
# presumably `name` is mapped so it shows up in the plotly tooltip - confirm.
rating_calories <- ggplot(cereal_by_rating) +
  geom_jitter(mapping = aes(x = calories, y = rating, fill = name)) +
  labs(
    title = "80 Cereals",
    subtitle = "Calorie vs Rating trend",
    x = "Calories",
    y = "Rating",
    caption = "Data Source: Chris Crawford"
  )

# Convert the static ggplot into an interactive plotly widget and display it.
rating_calorie <- ggplotly(rating_calories)
rating_calorie

The World Health Organization (WHO) recommends reducing sodium intake to lower blood pressure and the risk of cardiovascular disease, stroke, and coronary heart disease in adults. WHO recommends less than 2 g/day of sodium (5 g/day of salt) for adults, where adults are individuals 16 years of age or older. I decided to compare this recommendation with the highest-rated cereal on our list to see whether it would be a good product to eat. I also converted the cereal's sodium from milligrams to grams.

# Compare the sodium in the top-rated cereal with the WHO guideline for adults
# of less than 2 g of sodium per day (equivalent to 5 g of salt).

# cereal_by_rating is sorted by descending rating, so row 1 is the top cereal.
cereal_max_rating <- cereal_by_rating[1, ]
# Dividing by 1000 expresses sodium in grams, the unit used by the guideline.
cereal_max_rating$sodium_in_grams <- cereal_max_rating$sodium / 1000
cereal_max <- select(cereal_max_rating, name, sodium_in_grams)

# Keep the guideline as a typed one-row data frame so rbind() does not coerce
# the numeric column to character. The value is 2 (g of sodium), not the salt
# figure.
recommended_sodium <- data.frame(
  name = "WHO_recommendation",
  sodium_in_grams = 2
)
sodium_intake <- rbind(cereal_max, recommended_sodium)

# Side-by-side bars with the value printed just above each bar.
ggplot(data = sodium_intake) +
  geom_col(aes(x = name, y = sodium_in_grams, fill = name)) +
  labs(
    title = "Sodium",
    x = "",
    y = "Sodium (g)",
    subtitle = "All-Bran with Extra Fiber vs WHO recommendation for daily sodium intake 3/27/2022",
    caption = "Data Source: Chris Crawford"
  ) +
  geom_text(
    aes(x = name, y = sodium_in_grams, label = sodium_in_grams),
    vjust = -0.1
  )

A meta-analysis of 21 studies with 21 comparisons found that increased potassium intake resulted in a decrease in resting systolic blood pressure. The results suggest that the greatest impact on blood pressure was achieved when the increased potassium intake was approximately 3,519–4,692 mg/day. Let’s see how close the top-rated cereal comes to the WHO recommendation.

# Re-sort the cereals by potassium content, highest first, and keep the
# first five rows for plotting.
cereal_by_potassium <- arrange(cereal_by_rating, desc(potass))
potass_top_5 <- head(cereal_by_potassium, n = 5)

# Bar chart of those five cereals with the potassium value printed above each
# bar; x-axis labels are rotated so the long cereal names stay readable.
ggplot(potass_top_5) +
  geom_col(aes(x = name, y = potass, fill = name)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 0.4)) +
  labs(
    x = " ",
    y = "Potassium (mg)",
    title = "Potassium",
    subtitle = "Top Five brands with the highest amount of Potassium",
    caption = "Data Source: Chris Crawford"
  ) +
  geom_text(aes(x = name, y = potass, label = potass, vjust = -0.1))

# Compare the potassium in the top-rated cereal with 3519 mg/day, the lower
# end of the intake range quoted in the text above.
cereal_max_potass <- select(cereal_max_rating, name, potass)

# Keep the reference value as a typed one-row data frame so rbind() does not
# coerce the numeric potass column to character.
recommended_potassium <- data.frame(
  name = "WHO_recommendation",
  potass = 3519
)
potass_intake <- rbind(cereal_max_potass, recommended_potassium)

# Side-by-side bars with the value printed just above each bar.
ggplot(data = potass_intake) +
  geom_col(aes(x = name, y = potass, fill = name)) +
  labs(
    title = "Potassium",
    x = "",
    y = "Potassium (mg)",
    subtitle = "All-Bran with Extra Fiber vs WHO recommendation for daily potassium intake 3/27/2022",
    caption = "Data Source: Chris Crawford"
  ) +
  geom_text(aes(x = name, y = potass, label = potass), vjust = -0.1)

Pros

  • Low calories
  • Contains some potassium
  • Low in sodium

Cons

  • Isn’t close to filling the daily recommendation of potassium

Conclusion

The highest-rated cereal may be good for weight loss due to its low calorie and sodium content, but if you plan to eat this cereal, I suggest eating foods rich in potassium along with it. Examples:

##         food_group potassium_content_mg
## 1   Beans and peas                 1300
## 2             Nuts                  600
## 3 Green vegetables                  550
## 4  Root vegetables                  200
## 5 Other vegetables                  300
## 6           Fruits                  300