library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
head(fastfood)
## # A tibble: 6 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Arti… 380 60 7 2 0 95
## 2 Mcdonalds Sing… 840 410 45 17 1.5 130
## 3 Mcdonalds Doub… 1130 600 67 27 3 220
## 4 Mcdonalds Gril… 750 280 31 10 0.5 155
## 5 Mcdonalds Cris… 920 410 45 12 0.5 120
## 6 Mcdonalds Big … 540 250 28 10 1 80
## # … with 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>,
## # sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>,
## # salad <chr>
mcdonalds <- fastfood %>%
filter(restaurant == "Mcdonalds")
dairy_queen <- fastfood %>%
filter(restaurant == "Dairy Queen")
summary(mcdonalds$cal_fat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.0 160.0 240.0 285.6 320.0 1270.0
summary(dairy_queen$cal_fat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 160.0 220.0 260.5 310.0 670.0
attach(fastfood)
par(mfrow=c(2,2))
hist(mcdonalds$cal_fat, breaks=15, main ="McDonalds Fat Calories", xlab="Calories from fat", col = "lightyellow" )
hist(dairy_queen$cal_fat, breaks = 15, main ="Dairy Queen Fat Calories", xlab="Calories from fat", col = "lightblue")
ggplot(data = fastfood) +
geom_boxplot(mapping = aes(x = restaurant, y = cal_fat)) +
coord_flip()
dqmean <- mean(dairy_queen$cal_fat)
dqsd <- sd(dairy_queen$cal_fat)
ggplot(data = dairy_queen, aes(x = cal_fat)) +
geom_blank() +
geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, args = c(mean = dqmean, sd = dqsd), col = "tomato")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = dairy_queen, aes(sample = cal_fat)) +
geom_line(stat = "qq")
##### Simulating data from a normal distribution using rnorm:
set.seed(06151982) # make sure to change the seed
sim_norm <- rnorm(n = nrow(dairy_queen), mean = dqmean, sd = dqsd)
glimpse(sim_norm)
## num [1:42] 424 416 110 402 228 ...
head(sim_norm)
## [1] 424.0123 416.1715 109.5504 401.5509 228.3747 524.7356
summary(sim_norm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -61.5 155.8 318.3 294.2 422.1 617.6
hist(sim_norm)
#### Exercise 3 - Evaluating the normal distribution ###### Based on the following normal probability plot it appears that all the points DO NOT fall on a perfect line but the simulated probability plot is very similar to the original data.
sim_norm <- as_tibble(sim_norm)
##ggplot(data = sim_norm, aes(sample = df_sim_norm)) +
geom_line(stat = "qq")
## geom_line: na.rm = FALSE, orientation = NA
## stat_qq: na.rm = FALSE
## position_identity
set.seed(06151983) # make sure to change the seed
qqnormsim(sample = cal_fat, data = dairy_queen)
#### Exercise 4 ###### The the normal probability plot of the original data (fat calories from Dairy Queen) looks very similar to the simulated normal probability plots. The simulations, therefore, show that the fat calories depicted in the normal probability plot of Dairy Queen are nearly normal.
dqmean <- mean(mcdonalds$cal_fat)
dqsd <- sd(mcdonalds$cal_fat)
ggplot(data = mcdonalds, aes(x = cal_fat)) +
geom_blank() +
geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, args = c(mean = dqmean, sd = dqsd), col = "tomato")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###### 5b. Normal probability plot for mcdonalds fat calorie data distribution
ggplot(data = mcdonalds, aes(sample = cal_fat)) +
geom_line(stat = "qq")
###### 5c. Simulating data from a normal distribution using rnorm, and plotting it on a histogram, and a Normal probability plot.
set.seed(06151984) # make sure to change the seed
sim_norm_2 <- rnorm(n = nrow(mcdonalds), mean = dqmean, sd = dqsd)
glimpse(sim_norm_2)
## num [1:57] 626.8 20.1 659.4 237.7 22.7 ...
head(sim_norm_2)
## [1] 626.80690 20.12813 659.41600 237.73792 22.70363 400.23687
summary(sim_norm_2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -154.7 188.4 329.0 306.6 439.3 733.2
hist(sim_norm_2)
sim_norm_2
## [1] 626.80690 20.12813 659.41600 237.73792 22.70363 400.23687
## [7] 188.36680 382.59391 223.29162 10.30121 265.76931 384.96363
## [13] 329.03010 458.62670 346.05236 465.68589 288.70323 345.48978
## [19] 297.21017 461.63112 338.07513 294.85164 439.30188 584.27646
## [25] 486.39517 137.34872 364.89432 405.64326 266.00402 235.57064
## [31] 424.42116 149.24786 299.29086 331.72962 180.73649 261.56766
## [37] 238.54578 391.09660 490.01215 222.26794 -82.19240 441.12585
## [43] 615.23644 291.61243 -47.40149 177.35154 574.20991 460.09428
## [49] 612.97192 367.77919 41.10039 16.23121 733.21979 92.01541
## [55] -33.08123 417.35128 -154.72108
df_sim_norm_2 <- data.frame(sim_norm_2)
ggplot(data = df_sim_norm_2, aes(sample = sim_norm_2)) +
geom_line(stat = "qq")
###### 5d.Function displyaing multiple normal data probability simulations (in Q-Q plots) to compare against the original data in the top left corner.
set.seed(06151984) # make sure to change the seed
qqnormsim(sample = cal_fat, data = mcdonalds)
1 - pnorm(q = 600, mean = dqmean, sd = dqsd)
## [1] 0.07733771
dairy_queen %>%
filter(cal_fat > 600) %>%
summarise(percent = n() / nrow(dairy_queen))
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.0476
1 - pnorm(q = 350, mean = dqmean, sd = dqsd)
## [1] 0.3853452
mcdonalds %>%
filter(cal_fat > 350) %>%
summarise(percent = n() / nrow(mcdonalds))
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.211
1 - pnorm(q = 350, mean = dqmean, sd = dqsd)
## [1] 0.3853452
#chick_fil_A
#chick_fil_A %>%
# filter(cal_fat>350) %>%
#summarise(percent = n()/nrow(chick_fil_A))
ggplot(data = fastfood) +
geom_boxplot(mapping = aes(x = restaurant, y = sodium)) +
coord_flip()
###### Frequency histograms to study the shape of the distribution of the data. By eyeballing the histograms, it appears that Burger King and Subway data distribution may be the closest to the normal curve.
ggplot(data = fastfood) +
geom_histogram(aes(x = sodium), bins = 25, colour = "black", fill = "white") +
facet_wrap(~ restaurant)
###### Density histograms with a normal distribution curve overlay to study the shape of the distribution of the data. By looking at the histograms, it appears that Burger King, Subway, and Sonic data distribution may be the closest to the normal curve.
dqmean <- mean(fastfood$sodium)
dqsd <- sd(fastfood$sodium)
ggplot(data = fastfood, aes(x = sodium)) +
geom_blank() +
geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, args = c(mean = dqmean, sd = dqsd), col = "tomato") +
facet_wrap(~restaurant)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###### Normal probability plot os sodium for all fastfood restaurants being evaluated in this lab. In these graphs the curves more closely resembling a normal distribution appear to be those of Arby’s and Burger King.
ggplot(data = fastfood, aes(sample = sodium)) +
geom_line(stat = "qq") +
facet_wrap(~restaurant)
###### Disply of multiple normal data probability simulations (in Q-Q plots) to compare against the original restaurant data for sodium in the top left corner. ###### Of all the restaurants, it appears that Taco Bell data is the closest to a normal distribution.
Taco_Bell <- fastfood %>%
filter(restaurant == "Taco Bell")
Subway <- fastfood %>%
filter(restaurant == "Subway")
Sonic <- fastfood %>%
filter(restaurant == "Sonic")
Burger_King <- fastfood %>%
filter(restaurant == "Burger King")
Arbys <- fastfood %>%
filter(restaurant == "Arbys")
set.seed(06151985) # make sure to change the seed
qqnormsim(sample = sodium, data = Arbys)
set.seed(06151986) # make sure to change the seed
qqnormsim(sample = sodium, data = Burger_King)
#set.seed(06151987) # make sure to change the seed
#qqnormsim(sample = sodium, data = chick_fil_A)
set.seed(06151988) # make sure to change the seed
qqnormsim(sample = sodium, data = dairy_queen)
set.seed(06151989) # make sure to change the seed
qqnormsim(sample = sodium , data = mcdonalds)
set.seed(06151990) # make sure to change the seed
qqnormsim(sample = sodium , data = Sonic)
set.seed(06151991) # make sure to change the seed
qqnormsim(sample = sodium , data = Subway)
set.seed(06151992) # make sure to change the seed
qqnormsim(sample = sodium , data = Taco_Bell)
#### Exercise 8 ###### Note that some of the normal probability plots for sodium distributions seem to have a stepwise pattern. This is due to the data variables being discrete as opposed to continuous.
#ggplot(data = chick_fil_A, aes(sample = total_carb)) +
#geom_line(stat = "qq")
#hist(chick_fil_A$total_carb, breaks = 15, main = "Chick Fil-A Carbohydrates")
#dqmean <- mean(chick_fil_A$total_carb)
#dqsd <- sd(chick_fil_A$total_carb)
#ggplot(data = chick_fil_A, aes(x = total_carb)) +
# geom_blank() +
# geom_histogram(aes(y = ..density..), bins = 15) +
# stat_function(fun = dnorm, args = c(mean = dqmean, sd = dqsd), col = "tomato")