library(pacman)
p_load(tidyverse, tidylog, janitor)
# Raw CSV of the fast-food nutrition data in the rfordatascience/tidytuesday
# repository (2018-09-04 folder).
ff_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2018/2018-09-04/fastfood_calories.csv"

# Read the CSV straight from the URL into a tibble.
ff <- read_csv(file = ff_url)
## New names:
## Rows: 515 Columns: 18
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): restaurant, item, salad dbl (15): ...1, calories, cal_fat, total_fat,
## sat_fat, trans_fat, cholestero...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Normalise the column names with janitor::clean_names(); e.g. the unnamed
# first column, read in as `...1`, becomes `x1`.
ff <- clean_names(ff)
# Keep only the rows whose restaurant is "Mcdonalds".
Mcdonalds <- filter(ff, restaurant == "Mcdonalds")
## filter: removed 458 rows (89%), 57 rows remaining
# Per-column summary (min, quartiles, mean, max) of the McDonald's items.
Mcdonalds %>%
  summary()
## x1 restaurant item calories
## Min. : 1 Length:57 Length:57 Min. : 140.0
## 1st Qu.:15 Class :character Class :character 1st Qu.: 380.0
## Median :29 Mode :character Mode :character Median : 540.0
## Mean :29 Mean : 640.4
## 3rd Qu.:43 3rd Qu.: 740.0
## Max. :57 Max. :2430.0
## cal_fat total_fat sat_fat trans_fat
## Min. : 50.0 Min. : 5.00 Min. : 0.500 Min. :0.0000
## 1st Qu.: 160.0 1st Qu.: 18.00 1st Qu.: 4.500 1st Qu.:0.0000
## Median : 240.0 Median : 27.00 Median : 7.000 Median :0.0000
## Mean : 285.6 Mean : 31.81 Mean : 8.289 Mean :0.4649
## 3rd Qu.: 320.0 3rd Qu.: 36.00 3rd Qu.:11.000 3rd Qu.:1.0000
## Max. :1270.0 Max. :141.00 Max. :27.000 Max. :3.0000
## cholesterol sodium total_carb fiber
## Min. : 0.0 Min. : 20 Min. : 9.00 Min. :0.000
## 1st Qu.: 70.0 1st Qu.: 870 1st Qu.: 32.00 1st Qu.:2.000
## Median : 95.0 Median :1120 Median : 46.00 Median :3.000
## Mean :109.7 Mean :1438 Mean : 48.79 Mean :3.228
## 3rd Qu.:125.0 3rd Qu.:1780 3rd Qu.: 62.00 3rd Qu.:4.000
## Max. :475.0 Max. :6080 Max. :156.00 Max. :8.000
## sugar protein vit_a vit_c
## Min. : 0.00 Min. : 7.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 4.00 1st Qu.: 25.0 1st Qu.: 2.00 1st Qu.: 2.0
## Median : 9.00 Median : 33.0 Median : 6.00 Median :15.0
## Mean :11.07 Mean : 40.3 Mean : 33.72 Mean :18.3
## 3rd Qu.:13.00 3rd Qu.: 46.0 3rd Qu.: 20.00 3rd Qu.:25.0
## Max. :87.00 Max. :186.0 Max. :180.00 Max. :70.0
## calcium salad
## Min. : 0.0 Length:57
## 1st Qu.: 6.0 Class :character
## Median : 15.0 Mode :character
## Mean : 20.6
## 3rd Qu.: 20.0
## Max. :290.0
The mean calories per McDonald's item is 640.4; the maximum is 2,430 and the minimum is 140 calories per food item. The 20 piece Buttermilk Crispy Chicken Tenders is the item with the most calories.
# Items, across all restaurants, with more than 1000 calories.
high_cal <- filter(ff, calories > 1000)
## filter: removed 487 rows (95%), 28 rows remaining
# Number of high-calorie items per restaurant, largest count first.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
high_cal %>%
  count(restaurant, sort = TRUE)
## count: now 6 rows and 2 columns, ungrouped
## # A tibble: 6 × 2
## restaurant n
## <chr> <int>
## 1 Sonic 8
## 2 Burger King 6
## 3 Mcdonalds 6
## 4 Subway 4
## 5 Dairy Queen 3
## 6 Arbys 1
# Items from Subway or Chick Fil-A with fewer than 500 calories.
# NOTE(review): the object name says "1000" but the threshold used is 500 —
# confirm which one is intended.
sub_chick_1000 <- filter(
  ff,
  restaurant %in% c("Subway", "Chick Fil-A"),
  calories < 500
)
## filter: removed 440 rows (85%), 75 rows remaining
75 items from Subway or Chick Fil-A have fewer than 500 calories.
# Mean calories per restaurant; the result column keeps its default name,
# `mean(calories)`.
summarise(
  group_by(ff, restaurant),
  mean(calories)
)
## group_by: one grouping variable (restaurant)
## summarise: now 8 rows and 2 columns, ungrouped
## # A tibble: 8 × 2
## restaurant `mean(calories)`
## <chr> <dbl>
## 1 Arbys 533.
## 2 Burger King 609.
## 3 Chick Fil-A 384.
## 4 Dairy Queen 520.
## 5 Mcdonalds 640.
## 6 Sonic 632.
## 7 Subway 503.
## 8 Taco Bell 444.
McDonald's has the highest average calories per item (about 640).
# One row per restaurant: item count plus the mean calories, protein, sodium
# and total fat of its items. Missing values are ignored in each mean.
# `TRUE` is spelled out because `T` is a reassignable variable, not a
# reserved word.
nutrition_by_restaurant <- ff %>%
  group_by(restaurant) %>%
  summarize(
    number_of_items = n(),
    average_calories = mean(calories, na.rm = TRUE),
    average_protein = mean(protein, na.rm = TRUE),
    average_sodium = mean(sodium, na.rm = TRUE),
    average_total_fat = mean(total_fat, na.rm = TRUE),
    .groups = "drop"
  )
## group_by: one grouping variable (restaurant)
## summarize: now 8 rows and 6 columns, ungrouped
# Attach each restaurant's mean calories to every one of its items, then
# compute how far each item sits above (+) or below (-) that mean.
# Grouping is dropped again so later steps operate on the whole table.
# `TRUE` is spelled out because `T` is a reassignable variable.
ff <- ff %>%
  group_by(restaurant) %>%
  mutate(
    restaurant_avg_cal = mean(calories, na.rm = TRUE),
    cal_vs_avg = calories - restaurant_avg_cal
  ) %>%
  ungroup()
## group_by: one grouping variable (restaurant)
## mutate (grouped): new variable 'restaurant_avg_cal' (double) with 8 unique values and 0% NA
## new variable 'cal_vs_avg' (double) with 333 unique values and 0% NA
## ungroup: no grouping variables remain
# The five items that exceed their own restaurant's mean calories the most.
ff %>%
  select(restaurant, item, calories, restaurant_avg_cal, cal_vs_avg) %>%
  arrange(desc(cal_vs_avg)) %>%
  slice(seq_len(5))
## select: dropped 15 variables (x1, cal_fat, total_fat, sat_fat, trans_fat, …)
## slice: removed 510 rows (99%), 5 rows remaining
## # A tibble: 5 × 5
## restaurant item calories restaurant_avg_cal cal_vs_avg
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Mcdonalds 20 piece Buttermilk Crispy… 2430 640. 1790.
## 2 Mcdonalds 40 piece Chicken McNuggets 1770 640. 1130.
## 3 Mcdonalds 10 piece Sweet N' Spicy Ho… 1600 640. 960.
## 4 Burger King American Brewhouse King 1550 609. 941.
## 5 Mcdonalds 12 piece Buttermilk Crispy… 1510 640. 870.
# Estimate calories from the macronutrient columns: protein and total
# carbohydrate are weighted by 4, total fat by 9. `cal_discrepancy` is the
# reported value minus this estimate.
ff <- mutate(
  ff,
  estimated_cal = 4 * protein + 4 * total_carb + 9 * total_fat,
  cal_discrepancy = calories - estimated_cal
)
## mutate: new variable 'estimated_cal' (double) with 395 unique values and <1% NA
## new variable 'cal_discrepancy' (double) with 61 unique values and <1% NA
# Min, mean, median and max of the reported-minus-estimated calorie gap,
# skipping rows where the gap is missing. `.names` produces the columns
# min_discrepancy, mean_discrepancy, median_discrepancy, max_discrepancy.
ff %>%
  summarize(
    across(
      cal_discrepancy,
      list(
        min = ~ min(.x, na.rm = TRUE),
        mean = ~ mean(.x, na.rm = TRUE),
        median = ~ median(.x, na.rm = TRUE),
        max = ~ max(.x, na.rm = TRUE)
      ),
      .names = "{.fn}_discrepancy"
    )
  )
## summarize: now one row and 4 columns, ungrouped
## # A tibble: 1 × 4
## min_discrepancy mean_discrepancy median_discrepancy max_discrepancy
## <dbl> <dbl> <dbl> <dbl>
## 1 -892 -2.90 -2 190
library(ggplot2)
# Histogram of the calorie discrepancy in 25-calorie bins.
ff %>%
  ggplot(aes(x = cal_discrepancy)) +
  geom_histogram(binwidth = 25, fill = "pink", color = "white") +
  labs(
    title = "Distribution of Calorie Discrepancy",
    x = "Reported Calories − Estimated Calories",
    y = "Number of Items"
  )
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
# The ten items whose reported calories differ most, in absolute terms,
# from the macronutrient-based estimate.
ff %>%
  select(restaurant, item, calories, cal_discrepancy) %>%
  arrange(desc(abs(cal_discrepancy))) %>%
  slice(seq_len(10))
## select: dropped 18 variables (x1, cal_fat, total_fat, sat_fat, trans_fat, …)
## slice: removed 505 rows (98%), 10 rows remaining
## # A tibble: 10 × 4
## restaurant item calories cal_discrepancy
## <chr> <chr> <dbl> <dbl>
## 1 Sonic Ultimate Chicken Club 100 -892
## 2 Arbys Smokehouse Beef Short Rib Sandwich 590 -281
## 3 Burger King American Brewhouse King 1550 -204
## 4 Subway Footlong Oven Roasted Chicken 640 190
## 5 Burger King Bacon King 1040 188
## 6 Sonic Garlic Parmesan Dunked Ultimate Chicken… 1350 82
## 7 Sonic Buffalo Dunked Ultimate Chicken Sandwich 1000 79
## 8 Dairy Queen Deluxe Double Cheeseburger 640 58
## 9 Dairy Queen Deluxe Double Hamburger 540 54
## 10 Dairy Queen Original Double Cheeseburger 630 52
# Histogram of reported calories for every item, in 50-calorie bins.
ff %>%
  ggplot(aes(x = calories)) +
  geom_histogram(binwidth = 50, fill = "pink", color = "white") +
  labs(
    title = "Distribution of Calorie Across all Fast Food Items",
    x = "Reported Calories",
    y = "Number of Items"
  )
# One calorie boxplot per restaurant; coord_flip() lays the boxes out
# horizontally so the restaurant names run down the side.
ff %>%
  ggplot(aes(x = restaurant, y = calories)) +
  geom_boxplot(fill = "turquoise", alpha = 0.6) +
  coord_flip() +
  labs(
    title = "Calorie Distribution by Restaurant",
    x = NULL,
    y = "Calories"
  )
# Horizontal bar chart of mean sodium per item, with restaurants ordered by
# that mean.
nutrition_by_restaurant %>%
  ggplot(
    aes(x = reorder(restaurant, average_sodium), y = average_sodium)
  ) +
  geom_col(fill = "turquoise") +
  coord_flip() +
  labs(
    title = "Average Sodium per Item by Restaurant",
    x = NULL,
    y = "Average Sodium (mg)"
  )
# Scatter plot of calories against total fat, points coloured by restaurant,
# with a linear-model fit drawn in a fixed turquoise colour.
ff %>%
  ggplot(aes(x = total_fat, y = calories, color = restaurant)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE, color = "turquoise") +
  labs(
    title = "Does Fat Drive Calories?",
    x = "Total Fat (g)",
    y = "Calories",
    color = "Restaurant"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Question 6
#### Is there a relationship between fiber and calories?
# Scatter plot of calories against fiber with a linear-model fit line.
ggplot(data = ff, mapping = aes(x = fiber, y = calories)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between Fiber and Calories",
    x = "Fiber (g)",
    y = "Calories"
  )
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Simple linear regression of calories on fiber; rows with a missing value
# are dropped by lm() (see "observations deleted" in the output).
model <- lm(formula = calories ~ fiber, data = ff)

# Coefficient table, residual summary and fit statistics.
model %>%
  summary()
##
## Call:
## lm(formula = calories ~ fiber, data = ff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -470.82 -177.58 -50.25 135.65 1962.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 414.919 20.254 20.486 < 2e-16 ***
## fiber 26.476 3.948 6.707 5.38e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 268.7 on 501 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.08239, Adjusted R-squared: 0.08056
## F-statistic: 44.98 on 1 and 501 DF, p-value: 5.382e-11