This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openintro)
## Warning: package 'openintro' was built under R version 4.1.2
## Loading required package: airports
## Warning: package 'airports' was built under R version 4.1.2
## Loading required package: cherryblossom
## Warning: package 'cherryblossom' was built under R version 4.1.2
## Loading required package: usdata
## Warning: package 'usdata' was built under R version 4.1.2
# Load the `fastfood` data set shipped with openintro and preview its first rows.
data(fastfood, package = "openintro")
fastfood %>%
  head()
## # A tibble: 6 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Artisan G~ 380 60 7 2 0 95
## 2 Mcdonalds Single Ba~ 840 410 45 17 1.5 130
## 3 Mcdonalds Double Ba~ 1130 600 67 27 3 220
## 4 Mcdonalds Grilled B~ 750 280 31 10 0.5 155
## 5 Mcdonalds Crispy Ba~ 920 410 45 12 0.5 120
## 6 Mcdonalds Big Mac 540 250 28 10 1 80
## # ... with 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>,
## # sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>,
## # salad <chr>
# Keep only the rows whose restaurant is "Mcdonalds", then display the subset.
mcdonalds <- filter(fastfood, restaurant == "Mcdonalds")
mcdonalds
## # A tibble: 57 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Artisan ~ 380 60 7 2 0 95
## 2 Mcdonalds Single B~ 840 410 45 17 1.5 130
## 3 Mcdonalds Double B~ 1130 600 67 27 3 220
## 4 Mcdonalds Grilled ~ 750 280 31 10 0.5 155
## 5 Mcdonalds Crispy B~ 920 410 45 12 0.5 120
## 6 Mcdonalds Big Mac 540 250 28 10 1 80
## 7 Mcdonalds Cheesebu~ 300 100 12 5 0.5 40
## 8 Mcdonalds Classic ~ 510 210 24 4 0 65
## 9 Mcdonalds Double C~ 430 190 21 11 1 85
## 10 Mcdonalds Double Q~ 770 400 45 21 2.5 175
## # ... with 47 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## # fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## # calcium <dbl>, salad <chr>
# Keep only the rows whose restaurant is "Dairy Queen", then display the subset.
dairy_queen <- filter(fastfood, restaurant == "Dairy Queen")
dairy_queen
## # A tibble: 42 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Dairy Queen 1/2 lb.~ 1000 660 74 26 2 170
## 2 Dairy Queen 1/2 lb.~ 800 460 51 20 2 135
## 3 Dairy Queen 1/4 lb.~ 630 330 37 13 1 95
## 4 Dairy Queen 1/4 lb.~ 540 270 30 11 1 70
## 5 Dairy Queen 1/4 lb.~ 570 310 35 11 1 75
## 6 Dairy Queen Origina~ 400 160 18 9 1 65
## 7 Dairy Queen Origina~ 630 310 34 18 2 125
## 8 Dairy Queen 4 Piece~ 1030 480 53 9 1 80
## 9 Dairy Queen 6 Piece~ 1260 590 66 11 1 120
## 10 Dairy Queen Bacon C~ 420 240 26 11 1 60
## # ... with 32 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## # fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## # calcium <dbl>, salad <chr>
# Five-bin histogram of calories from fat for the McDonald's subset.
ggplot(mcdonalds, aes(x = cal_fat)) +
  geom_histogram(bins = 5) +
  labs(
    title = "Distribution of McDonalds Fat Calories",
    x = "Calories from Fat",
    y = "Frequency"
  )
# Five-bin histogram of calories from fat for the Dairy Queen subset.
ggplot(dairy_queen, aes(x = cal_fat)) +
  geom_histogram(bins = 5) +
  labs(
    title = "Distribution of Dairy Queen Fat Calories",
    x = "Calories from Fat",
    y = "Frequency"
  )
# Numeric summary (min, quartiles, median, mean, max) and a base-R histogram
# of calories from fat, first for McDonald's and then for Dairy Queen, so the
# two distributions can be compared.
# McDonald's:
summary(mcdonalds$cal_fat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.0 160.0 240.0 285.6 320.0 1270.0
# hist() derives its default title and x-axis label from the argument expression.
hist(mcdonalds$cal_fat)
# Dairy Queen:
summary(dairy_queen$cal_fat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 160.0 220.0 260.5 310.0 670.0
hist(dairy_queen$cal_fat)
# Sample mean of Dairy Queen calories from fat; reused below as the mean of
# the fitted normal curve and of the simulated sample.
dairy_queen_mean <- dairy_queen %>%
  pull(cal_fat) %>%
  mean()
dairy_queen_mean
## [1] 260.4762
# Sample standard deviation of the same column; reused below as the sd.
dairy_queen_sd <- dairy_queen %>%
  pull(cal_fat) %>%
  sd()
dairy_queen_sd
## [1] 156.4851
# Density-scaled histogram of Dairy Queen calories from fat with a normal
# curve overlaid. The curve uses the sample mean and standard deviation
# computed above, so the bars and the curve share the same vertical scale.
# No bin count is set, so stat_bin() falls back to its default of 30 bins.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  # after_stat(density) replaces the deprecated `..density..` notation.
  geom_histogram(aes(y = after_stat(density))) +
  # `args` is documented as a list of extra arguments passed on to `fun`.
  stat_function(
    fun = dnorm,
    args = list(mean = dairy_queen_mean, sd = dairy_queen_sd),
    col = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Exercise 2: Based on this plot, does it appear that the data follow a nearly normal distribution?
# Normal Q-Q plot of calories from fat, drawn as a connected line.
# NOTE(review): this plots the McDonald's subset although the surrounding
# exercise and the simulation that follows use Dairy Queen -- confirm intended.
ggplot(mcdonalds, aes(sample = cal_fat)) +
  stat_qq(geom = "line")
# Draw one simulated normal sample with rnorm(), matching the Dairy Queen
# subset in size, mean and standard deviation, then print the draws.
sim_norm <- rnorm(
  nrow(dairy_queen),
  mean = dairy_queen_mean,
  sd = dairy_queen_sd
)
sim_norm
## [1] 286.73005 65.55379 312.13604 271.23463 222.89668 130.65989 394.76504
## [8] 431.88867 176.25874 127.78394 494.68598 262.82159 268.50988 175.31068
## [15] 170.90054 378.78251 232.00168 279.00263 398.47843 524.81592 199.27322
## [22] 291.06216 57.78458 95.63536 127.63341 189.71632 90.79967 426.15706
## [29] 315.06638 372.19844 -25.78780 180.92252 128.87719 392.75177 341.77534
## [36] 301.74861 96.21391 112.08087 178.50369 134.86865 352.72005 455.37863
# Compare Dairy Queen calories from fat against the simulated-normal panels
# that qqnormsim() draws.
qqnormsim(data = dairy_queen, sample = cal_fat)
# Exercise 4: Does the normal probability plot for the calories from fat look similar to the plots created for the simulated data? That is, do the plots provide evidence that the calories are nearly normal?
# The Dairy Queen cal_fat data (plotted below) is nearly normal. It closely follows a diagonal line and is incredibly
# similar to the plots generated by qqnormsim (i.e., sim 7).
The McDonald's cal_fat data (plotted below) is nearly normal. Although the slope is rather small near the beginning and noticeably larger later on, the points do form a diagonal line and closely resemble a couple of the simulated plots up until the higher values (i.e., sim 1 or 3).
# Compare McDonald's calories from fat against the simulated-normal panels
# that qqnormsim() draws.
qqnormsim(data = mcdonalds, sample = cal_fat)
# Normal probabilities: chance of more than 600 calories from fat under a
# normal model fitted with the Dairy Queen sample mean and standard deviation.
1 - pnorm(
  600,
  mean = dairy_queen_mean,
  sd = dairy_queen_sd
)
## [1] 0.01501523
# Empirical counterpart: share of Dairy Queen rows with cal_fat above 600.
# Rows where the comparison is NA are not counted, as with filter().
dairy_queen %>%
  summarise(percent = sum(cal_fat > 600, na.rm = TRUE) / n())
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.0476
# Arby's: probability of more than 30 g of protein, theoretical vs. observed.
arbys <- filter(fastfood, restaurant == "Arbys")
arbys_mean <- mean(arbys[["protein"]])
arbys_sd <- sd(arbys[["protein"]])
# Theoretical: upper tail of a normal model fitted with the sample mean and sd.
1 - pnorm(
  30,
  mean = arbys_mean,
  sd = arbys_sd
)
## [1] 0.4760042
# Observed: share of Arby's rows with protein above 30.
# Rows where the comparison is NA are not counted, as with filter().
arbys %>%
  summarise(percent = sum(protein > 30, na.rm = TRUE) / n())
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.436
Exercise 7: Now let’s consider some of the other variables in the dataset. Out of all the different restaurants, which ones’ distribution is the closest to normal for sodium?
Based on the plots below, Burger King and Chick Fil-A had the distributions closest to normal for sodium.
# Arby's: normal Q-Q plot of sodium.
arbys <- filter(fastfood, restaurant == "Arbys")
qqnorm(arbys[["sodium"]], main = "Arbys")
# Burger King: normal Q-Q plot of sodium.
bk <- filter(fastfood, restaurant == "Burger King")
qqnorm(bk[["sodium"]], main = "Burger King")
# Chick Fil-A: normal Q-Q plot of sodium ** (starred in the original notes).
chick_fil_a <- filter(fastfood, restaurant == "Chick Fil-A")
qqnorm(chick_fil_a[["sodium"]], main = "Chick Fil-A")
# Exercise 8: Note that some of the normal probability plots for sodium distributions seem to have a stepwise pattern. Why do you think this might be the case?
# Subway: normal Q-Q plot of sodium.
sw <- filter(fastfood, restaurant == "Subway")
qqnorm(sw[["sodium"]], main = "Subway")
# Taco Bell: normal Q-Q plot of sodium.
taco_bell <- filter(fastfood, restaurant == "Taco Bell")
qqnorm(taco_bell[["sodium"]], main = "Taco Bell")
The distribution of total carbohydrates appears right skewed, and the histogram confirms this, with the data concentrated on the left and a tail running to the right.
# Normal probability plot, summary and histogram of total carbohydrates for
# Dairy Queen. Every call reads from the `dairy_queen` subset so the data
# match the "Dairy Queen" titles; the full `fastfood` table would mix every
# restaurant together.
qqnorm(dairy_queen$total_carb, main = "Dairy Queen Carbs")
# Reference line to judge departures from normality.
qqline(dairy_queen$total_carb)
summary(dairy_queen$total_carb)
# NOTE: re-knit to refresh the printed summary. The figures previously echoed
# here (mean 45.66, max 156.00) described all restaurants, not Dairy Queen.
hist(
  dairy_queen$total_carb,
  main = "Dairy Queen Carbs",
  xlab = "Total carbohydrates"
)