Homework 2 - Data Hack

Question 1
- 1.a
- 1.b
- 1.c
Question 2
- 2.a
- 2.b
Question 3
- 3.a
- 3.b
Question 4
- 4.a
- 4.b
- 4.c
Question 5
- 5.a
- 5.b
- 5.c
- 5.d

Load data

library(pacman)
p_load(tidyverse, tidylog, janitor)

# Read the fast-food calories table (TidyTuesday, 2018-09-04) straight from
# GitHub. The first column of the CSV has no header, so readr renames it.
ff <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2018/2018-09-04/fastfood_calories.csv"
)

## New names:
## Rows: 515 Columns: 18
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (3): restaurant, item, salad dbl (15): ...1, calories, cal_fat, total_fat,
## sat_fat, trans_fat, cholestero...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

Question 1

1.a

# Normalise the column names (the unnamed first column `...1` becomes `x1`).
ff <- clean_names(ff)

# Keep only the McDonald's rows; the data spells the restaurant "Mcdonalds".
Mcdonalds <- filter(ff, restaurant == "Mcdonalds")

## filter: removed 458 rows (89%), 57 rows remaining

# Per-column summary statistics for the McDonald's subset.
Mcdonalds %>%
  summary()

##        x1      restaurant            item              calories     
##  Min.   : 1   Length:57          Length:57          Min.   : 140.0  
##  1st Qu.:15   Class :character   Class :character   1st Qu.: 380.0  
##  Median :29   Mode  :character   Mode  :character   Median : 540.0  
##  Mean   :29                                         Mean   : 640.4  
##  3rd Qu.:43                                         3rd Qu.: 740.0  
##  Max.   :57                                         Max.   :2430.0  
##     cal_fat         total_fat         sat_fat         trans_fat     
##  Min.   :  50.0   Min.   :  5.00   Min.   : 0.500   Min.   :0.0000  
##  1st Qu.: 160.0   1st Qu.: 18.00   1st Qu.: 4.500   1st Qu.:0.0000  
##  Median : 240.0   Median : 27.00   Median : 7.000   Median :0.0000  
##  Mean   : 285.6   Mean   : 31.81   Mean   : 8.289   Mean   :0.4649  
##  3rd Qu.: 320.0   3rd Qu.: 36.00   3rd Qu.:11.000   3rd Qu.:1.0000  
##  Max.   :1270.0   Max.   :141.00   Max.   :27.000   Max.   :3.0000  
##   cholesterol        sodium       total_carb         fiber      
##  Min.   :  0.0   Min.   :  20   Min.   :  9.00   Min.   :0.000  
##  1st Qu.: 70.0   1st Qu.: 870   1st Qu.: 32.00   1st Qu.:2.000  
##  Median : 95.0   Median :1120   Median : 46.00   Median :3.000  
##  Mean   :109.7   Mean   :1438   Mean   : 48.79   Mean   :3.228  
##  3rd Qu.:125.0   3rd Qu.:1780   3rd Qu.: 62.00   3rd Qu.:4.000  
##  Max.   :475.0   Max.   :6080   Max.   :156.00   Max.   :8.000  
##      sugar          protein          vit_a            vit_c     
##  Min.   : 0.00   Min.   :  7.0   Min.   :  0.00   Min.   : 0.0  
##  1st Qu.: 4.00   1st Qu.: 25.0   1st Qu.:  2.00   1st Qu.: 2.0  
##  Median : 9.00   Median : 33.0   Median :  6.00   Median :15.0  
##  Mean   :11.07   Mean   : 40.3   Mean   : 33.72   Mean   :18.3  
##  3rd Qu.:13.00   3rd Qu.: 46.0   3rd Qu.: 20.00   3rd Qu.:25.0  
##  Max.   :87.00   Max.   :186.0   Max.   :180.00   Max.   :70.0  
##     calcium         salad          
##  Min.   :  0.0   Length:57         
##  1st Qu.:  6.0   Class :character  
##  Median : 15.0   Mode  :character  
##  Mean   : 20.6                     
##  3rd Qu.: 20.0                     
##  Max.   :290.0

Mean calories at McDonalds is 640.4; the maximum is 2430 and the minimum is 140 calories per food item. The 20 piece Buttermilk Crispy Chicken Tenders is the item with the most calories.

1.b

# Items with strictly more than 1000 calories, across all restaurants.
high_cal <- filter(ff, calories > 1000)

## filter: removed 487 rows (95%), 28 rows remaining

# Count the high-calorie items per restaurant, most frequent first.
# `TRUE` is spelled out: `T` is an ordinary variable that can be reassigned,
# whereas `TRUE` is a reserved word.
high_cal %>%
  count(restaurant, sort = TRUE)

## count: now 6 rows and 2 columns, ungrouped

## # A tibble: 6 × 2
##   restaurant      n
##   <chr>       <int>
## 1 Sonic           8
## 2 Burger King     6
## 3 Mcdonalds       6
## 4 Subway          4
## 5 Dairy Queen     3
## 6 Arbys           1

1.c

# Subway and Chick Fil-A items with fewer than 500 calories.
# NOTE(review): the object name says "1000" but the threshold used is 500 --
# confirm which one the question intends.
sub_chick_1000 <- filter(
  ff,
  restaurant %in% c("Subway", "Chick Fil-A"),
  calories < 500
)

## filter: removed 440 rows (85%), 75 rows remaining

75 items at Subway or Chick Fil-A have fewer than 500 calories.

Question 2

2.a

# Average calories per restaurant. The summary column is left unnamed, so it
# is labelled with the expression text `mean(calories)`.
summarise(
  group_by(ff, restaurant),
  mean(calories)
)

## group_by: one grouping variable (restaurant)
## summarise: now 8 rows and 2 columns, ungrouped

## # A tibble: 8 × 2
##   restaurant  `mean(calories)`
##   <chr>                  <dbl>
## 1 Arbys                   533.
## 2 Burger King             609.
## 3 Chick Fil-A             384.
## 4 Dairy Queen             520.
## 5 Mcdonalds               640.
## 6 Sonic                   632.
## 7 Subway                  503.
## 8 Taco Bell               444.

McDonalds has the highest average calories per item (about 640).

2.b

# Per-restaurant nutrition summary: the number of menu items plus the mean
# calories, protein, sodium and total fat. Missing values are ignored in the
# means. `.groups = "drop"` returns an ungrouped tibble.
# `TRUE` is used instead of `T`, which is a reassignable variable.
nutrition_by_restaurant <- ff %>%
  group_by(restaurant) %>%
  summarize(
    number_of_items = n(),
    average_calories = mean(calories, na.rm = TRUE),
    average_protein = mean(protein, na.rm = TRUE),
    average_sodium = mean(sodium, na.rm = TRUE),
    average_total_fat = mean(total_fat, na.rm = TRUE),
    .groups = "drop"
  )

## group_by: one grouping variable (restaurant)
## summarize: now 8 rows and 6 columns, ungrouped

Question 3

3.a

# Add two columns to every item:
#   restaurant_avg_cal - mean calories of the item's own restaurant
#   cal_vs_avg         - how far the item sits above (+) or below (-) that mean
# The grouping is removed afterwards so later steps work on the whole table.
# `TRUE` is used instead of `T`, which is a reassignable variable.
ff <- ff %>%
  group_by(restaurant) %>%
  mutate(
    restaurant_avg_cal = mean(calories, na.rm = TRUE),
    cal_vs_avg = calories - restaurant_avg_cal
  ) %>%
  ungroup()

## group_by: one grouping variable (restaurant)
## mutate (grouped): new variable 'restaurant_avg_cal' (double) with 8 unique values and 0% NA
##                   new variable 'cal_vs_avg' (double) with 333 unique values and 0% NA
## ungroup: no grouping variables remain

3.b

# The five items that exceed their own restaurant's average calories by the
# largest margin, showing only the columns needed to read the comparison.
ff %>%
  select(restaurant, item, calories, restaurant_avg_cal, cal_vs_avg) %>%
  arrange(desc(cal_vs_avg)) %>%
  slice(1:5)

## select: dropped 15 variables (x1, cal_fat, total_fat, sat_fat, trans_fat, …)
## slice: removed 510 rows (99%), 5 rows remaining

## # A tibble: 5 × 5
##   restaurant  item                        calories restaurant_avg_cal cal_vs_avg
##   <chr>       <chr>                          <dbl>              <dbl>      <dbl>
## 1 Mcdonalds   20 piece Buttermilk Crispy…     2430               640.      1790.
## 2 Mcdonalds   40 piece Chicken McNuggets      1770               640.      1130.
## 3 Mcdonalds   10 piece Sweet N' Spicy Ho…     1600               640.       960.
## 4 Burger King American Brewhouse King         1550               609.       941.
## 5 Mcdonalds   12 piece Buttermilk Crispy…     1510               640.       870.

Question 4

4.a

# Estimate calories from the macronutrients: protein and total carbohydrate
# are weighted by 4 and total fat by 9 (presumably the usual kcal-per-gram
# factors). `cal_discrepancy` is reported minus estimated calories.
ff <- mutate(
  ff,
  estimated_cal = 4 * protein + 4 * total_carb + 9 * total_fat,
  cal_discrepancy = calories - estimated_cal
)

## mutate: new variable 'estimated_cal' (double) with 395 unique values and <1% NA
##         new variable 'cal_discrepancy' (double) with 61 unique values and <1% NA

4.b

# One-row overview of the calorie discrepancy: smallest, mean, median and
# largest value, skipping items where the discrepancy is missing.
summarize(
  ff,
  min_discrepancy = min(cal_discrepancy, na.rm = TRUE),
  mean_discrepancy = mean(cal_discrepancy, na.rm = TRUE),
  median_discrepancy = median(cal_discrepancy, na.rm = TRUE),
  max_discrepancy = max(cal_discrepancy, na.rm = TRUE)
)

## summarize: now one row and 4 columns, ungrouped

## # A tibble: 1 × 4
##   min_discrepancy mean_discrepancy median_discrepancy max_discrepancy
##             <dbl>            <dbl>              <dbl>           <dbl>
## 1            -892            -2.90                 -2             190

# Histogram of reported minus estimated calories, in 25-calorie bins.
# No `library(ggplot2)` call is needed here: ggplot2 is already attached by
# `p_load(tidyverse, ...)` at the top of the file.
# Items with a missing `cal_discrepancy` are dropped by the binning step,
# which emits a warning.
ggplot(data = ff, aes(x = cal_discrepancy)) +
  geom_histogram(
    binwidth = 25,
    fill = "pink",
    color = "white"
  ) +
  labs(
    title = "Distribution of Calorie Discrepancy",
    x = "Reported Calories − Estimated Calories",
    y = "Number of Items"
  )

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

4.c

# The ten items whose reported calories differ most from the estimate, in
# either direction (ranked by the absolute discrepancy).
ff %>%
  select(restaurant, item, calories, cal_discrepancy) %>%
  arrange(desc(abs(cal_discrepancy))) %>%
  slice(1:10)

## select: dropped 18 variables (x1, cal_fat, total_fat, sat_fat, trans_fat, …)
## slice: removed 505 rows (98%), 10 rows remaining

## # A tibble: 10 × 4
##    restaurant  item                                     calories cal_discrepancy
##    <chr>       <chr>                                       <dbl>           <dbl>
##  1 Sonic       Ultimate Chicken Club                         100            -892
##  2 Arbys       Smokehouse Beef Short Rib Sandwich            590            -281
##  3 Burger King American Brewhouse King                      1550            -204
##  4 Subway      Footlong Oven Roasted Chicken                 640             190
##  5 Burger King Bacon King                                   1040             188
##  6 Sonic       Garlic Parmesan Dunked Ultimate Chicken…     1350              82
##  7 Sonic       Buffalo Dunked Ultimate Chicken Sandwich     1000              79
##  8 Dairy Queen Deluxe Double Cheeseburger                    640              58
##  9 Dairy Queen Deluxe Double Hamburger                       540              54
## 10 Dairy Queen Original Double Cheeseburger                  630              52

Question 5

5.a

# Histogram of reported calories over every menu item, in 50-calorie bins.
ff %>%
  ggplot(aes(x = calories)) +
  geom_histogram(binwidth = 50, fill = "pink", color = "white") +
  labs(
    y = "Number of Items",
    x = "Reported Calories",
    title = "Distribution of Calorie Across all Fast Food Items"
  )

5.b

# One box plot of calories per restaurant. The axes are flipped so the
# restaurant names run down the side; the restaurant axis title is removed.
ff %>%
  ggplot(aes(x = restaurant, y = calories)) +
  geom_boxplot(alpha = 0.6, fill = "turquoise") +
  labs(
    y = "Calories",
    x = NULL,
    title = "Calorie Distribution by Restaurant"
  ) +
  coord_flip()

5.c

# Horizontal bar chart of the average sodium per item, with the restaurants
# ordered by that average.
nutrition_by_restaurant %>%
  ggplot(aes(x = reorder(restaurant, average_sodium), y = average_sodium)) +
  geom_col(fill = "turquoise") +
  labs(
    y = "Average Sodium (mg)",
    x = NULL,
    title = "Average Sodium per Item by Restaurant"
  ) +
  coord_flip()

5.d

# Calories against total fat, one point per item coloured by restaurant, with
# a straight-line (lm) fit drawn in a fixed turquoise colour and no
# confidence band.
ggplot(
  data = ff,
  mapping = aes(x = total_fat, y = calories, color = restaurant)
) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", color = "turquoise", se = FALSE) +
  labs(
    color = "Restaurant",
    y = "Calories",
    x = "Total Fat (g)",
    title = "Does Fat Drive Calories?"
  )

## `geom_smooth()` using formula = 'y ~ x'

Question 6

Is there a relationship between fiber and calories?

# Calories against fiber, one point per item, with a straight-line (lm) fit
# and no confidence band. Items with missing values are dropped with a
# warning.
ggplot(data = ff, mapping = aes(x = fiber, y = calories)) +
  geom_point(alpha = 0.5) +
  geom_smooth(se = FALSE, method = "lm") +
  labs(
    y = "Calories",
    x = "Fiber (g)",
    title = "Relationship between Fiber and Calories"
  )

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Simple linear regression of calories on fiber; rows with missing values are
# dropped by lm(). The summary prints coefficients, residuals and fit
# statistics.
model <- lm(
  formula = calories ~ fiber,
  data = ff
)
summary(model)

## 
## Call:
## lm(formula = calories ~ fiber, data = ff)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -470.82 -177.58  -50.25  135.65 1962.13 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  414.919     20.254  20.486  < 2e-16 ***
## fiber         26.476      3.948   6.707 5.38e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 268.7 on 501 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.08239,    Adjusted R-squared:  0.08056 
## F-statistic: 44.98 on 1 and 501 DF,  p-value: 5.382e-11