setwd("~/DATA110")
Let’s load the packages.
library(tidyverse)
## -- Attaching packages -------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
head(fastfood)
## # A tibble: 6 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Arti~ 380 60 7 2 0 95
## 2 Mcdonalds Sing~ 840 410 45 17 1.5 130
## 3 Mcdonalds Doub~ 1130 600 67 27 3 220
## 4 Mcdonalds Gril~ 750 280 31 10 0.5 155
## 5 Mcdonalds Cris~ 920 410 45 12 0.5 120
## 6 Mcdonalds Big ~ 540 250 28 10 1 80
## # ... with 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>,
## # sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>,
## # salad <chr>
glimpse(fastfood)
## Rows: 515
## Columns: 17
## $ restaurant <chr> "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mc...
## $ item <chr> "Artisan Grilled Chicken Sandwich", "Single Bacon Smoke...
## $ calories <dbl> 380, 840, 1130, 750, 920, 540, 300, 510, 430, 770, 380,...
## $ cal_fat <dbl> 60, 410, 600, 280, 410, 250, 100, 210, 190, 400, 170, 3...
## $ total_fat <dbl> 7, 45, 67, 31, 45, 28, 12, 24, 21, 45, 18, 34, 20, 34, ...
## $ sat_fat <dbl> 2.0, 17.0, 27.0, 10.0, 12.0, 10.0, 5.0, 4.0, 11.0, 21.0...
## $ trans_fat <dbl> 0.0, 1.5, 3.0, 0.5, 0.5, 1.0, 0.5, 0.0, 1.0, 2.5, 0.0, ...
## $ cholesterol <dbl> 95, 130, 220, 155, 120, 80, 40, 65, 85, 175, 40, 95, 12...
## $ sodium <dbl> 1110, 1580, 1920, 1940, 1980, 950, 680, 1040, 1040, 129...
## $ total_carb <dbl> 44, 62, 63, 62, 81, 46, 33, 49, 35, 42, 38, 48, 48, 67,...
## $ fiber <dbl> 3, 2, 3, 2, 4, 3, 2, 3, 2, 3, 2, 3, 3, 5, 2, 2, 3, 3, 5...
## $ sugar <dbl> 11, 18, 18, 18, 18, 9, 7, 6, 7, 10, 5, 11, 11, 11, 6, 3...
## $ protein <dbl> 37, 46, 70, 55, 46, 25, 15, 25, 25, 51, 15, 32, 42, 33,...
## $ vit_a <dbl> 4, 6, 10, 6, 6, 10, 10, 0, 20, 20, 2, 10, 10, 10, 2, 4,...
## $ vit_c <dbl> 20, 20, 20, 25, 20, 2, 2, 4, 4, 6, 0, 10, 20, 15, 2, 6,...
## $ calcium <dbl> 20, 20, 50, 20, 20, 15, 10, 2, 15, 20, 15, 35, 35, 35, ...
## $ salad <chr> "Other", "Other", "Other", "Other", "Other", "Other", "...
mcdonalds <- fastfood %>%
filter(restaurant == "Mcdonalds")
dairy_queen <- fastfood %>%
filter(restaurant == "Dairy Queen")
head(mcdonalds)
## # A tibble: 6 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Arti~ 380 60 7 2 0 95
## 2 Mcdonalds Sing~ 840 410 45 17 1.5 130
## 3 Mcdonalds Doub~ 1130 600 67 27 3 220
## 4 Mcdonalds Gril~ 750 280 31 10 0.5 155
## 5 Mcdonalds Cris~ 920 410 45 12 0.5 120
## 6 Mcdonalds Big ~ 540 250 28 10 1 80
## # ... with 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>,
## # sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>,
## # salad <chr>
**Answer: Both appear skewed to the right. Their centers are about 200 and 250, respectively so fairly close. The spread of McDonalds is slightly wider as it extends beyond 1000 while Dairy Queen stops below 750. They do not appear bell-shaped to me.
dairy_queen %>%
ggplot() +
geom_histogram(aes(x = cal_fat), bins = 7) +
ggtitle("Dairy Queen Calories from Fat") +
xlab("Calories from Fat") +
ylab("Frequency")
mcdonalds %>%
ggplot() +
geom_histogram(aes(x = cal_fat), bins = 7) +
ggtitle("McDonalds Calories from Fat") +
xlab("Calories from Fat") +
ylab("Frequency")
fastfood %>%
filter(restaurant == "Mcdonalds" | restaurant == "Dairy Queen") %>%
ggplot() +
geom_histogram(aes(x = cal_fat), bins = 7) +
ggtitle("Calories from Fat by Restaurant") +
xlab("Calories from Fat") +
ylab("Frequency") +
facet_wrap(. ~restaurant)
dqmean <- mean(dairy_queen$cal_fat)
dqsd <- sd(dairy_queen$cal_fat)
dqmean
## [1] 260.4762
dqsd
## [1] 156.4851
ggplot(data = dairy_queen, aes(x = cal_fat)) +
geom_blank() +
geom_histogram(aes(y = ..density..), bins = 7) +
stat_function(fun = dnorm, args = c(mean = dqmean, sd = dqsd), col = "tomato") +
xlab("calories from fat") +
ggtitle("Dairy Queen")
mcmean <- mean(mcdonalds$cal_fat)
mcsd <- sd(mcdonalds$cal_fat)
mcmean
## [1] 285.614
mcsd
## [1] 220.8993
ggplot(data = mcdonalds, aes(x = cal_fat)) +
geom_blank() +
geom_histogram(aes(y = ..density..), bins = 7) +
stat_function(fun = dnorm, args = c(mean = mcmean, sd = mcsd), col = "tomato") +
xlab("calories from fat") +
ggtitle("McDonalds")
Answer: It’s hard to tell so we will explore with a QQ plot
ggplot(data = dairy_queen, aes(sample = cal_fat)) +
geom_line(stat = "qq")
sim_norm <- rnorm(n = nrow(dairy_queen), mean = dqmean, sd = dqsd)
qqnorm(sim_norm)
The above still does not show all data point on a straight line; it looks to similar to the real data.
Answer: Yes, they look very similar, so this is evidence that they may be nearly normal despite my assuming the histogram was off; however the Shapiro-Wild Test very strongly suggest this data is not normal (p = .007)
# perform shapiro test for normality
shapiro.test(dairy_queen$cal_fat)
##
## Shapiro-Wilk normality test
##
## data: dairy_queen$cal_fat
## W = 0.92299, p-value = 0.007568
Probability Chick Fil-A sugar is >10 and probability that calories are >400
chickfila <- fastfood %>%
filter(restaurant == "Chick Fil-A")
head(chickfila)
## # A tibble: 6 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Chick Fil~ Char~ 430 144 16 8 0 85
## 2 Chick Fil~ Char~ 310 54 6 2 0 55
## 3 Chick Fil~ Chic~ 270 99 11 2.5 0 45
## 4 Chick Fil~ 1 Pi~ 120 54 6 3 0 25
## 5 Chick Fil~ 2 Pi~ 230 108 12 3 0 55
## 6 Chick Fil~ 3 Pi~ 350 153 17 3 0 70
## # ... with 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>,
## # sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>,
## # salad <chr>
#puts Chick Fil-A into its own dataframe names chickfila
sugarmean <- mean(chickfila$sugar)
sugarsd <- sd(chickfila$sugar)
calmean <- mean(chickfila$calories)
calsd <- sd(chickfila$calories)
####pnorm for sugar and calories
1-pnorm(10, mean = sugarmean, sd = sugarsd)
## [1] 0.05521115
1-pnorm(400, mean = calmean, sd = calsd)
## [1] 0.4718786
#theoretical probabilities are 1 - the results printed below
chickfila %>%
filter(sugar > 10) %>%
summarise(percent = n() / nrow(chickfila))
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.0370
chickfila %>%
filter(calories > 400) %>%
summarise(percent = n() / nrow(chickfila))
## # A tibble: 1 x 1
## percent
## <dbl>
## 1 0.481
**They are as close as the others for McDonalds and Dairy Queen as far as appearing nearly normal
**from the histograms it looks like Sonic or Burger King might be closest to normal.
fastfood %>%
group_by(restaurant) %>%
ggplot() +
geom_histogram(aes(x = sodium), bins = 15) +
ggtitle("Restaurant Sodium") +
xlab("Sodium") +
ylab("Frequency") +
facet_wrap(. ~restaurant)
fastfood %>%
group_by(restaurant) %>%
ggplot(aes(sample = sodium)) +
geom_line(stat = "qq") +
facet_wrap(.~restaurant)
**Looks like Burger King or Taco Bell are closest to normal. Let’s look at Burger King, Taco Bell, Sonic and Arbys by creating a dataframe for each to analyze
taco_bell <- fastfood %>%
filter(restaurant == "Taco Bell")
burger_king <- fastfood %>%
filter(restaurant == "Burger King")
sonic <- fastfood %>%
filter(restaurant == "Sonic")
arbys <- fastfood %>%
filter(restaurant == "Arbys")
**Answer: Arbys and Burger King are normal under this test whereas the results strongly suggest Taco Bell and Sonic are not
shapiro.test(taco_bell$sodium)
##
## Shapiro-Wilk normality test
##
## data: taco_bell$sodium
## W = 0.95501, p-value = 0.000699
shapiro.test(burger_king$sodium)
##
## Shapiro-Wilk normality test
##
## data: burger_king$sodium
## W = 0.97291, p-value = 0.1331
shapiro.test(sonic$sodium)
##
## Shapiro-Wilk normality test
##
## data: sonic$sodium
## W = 0.82286, p-value = 1.784e-06
shapiro.test(arbys$sodium)
##
## Shapiro-Wilk normality test
##
## data: arbys$sodium
## W = 0.97073, p-value = 0.1985
**Answer: There are many repeated values
**Answer: appears skewed to the right
sonicplot <- sonic %>%
ggplot() +
geom_line(aes(sample = total_carb), stat = "qq") +
ggtitle("Sonic Normal Probability Plot - Carbohydrates")
sonicplot
**Answer: confirms somewhat skewed to the right
sonic_hist <- sonic %>%
ggplot() +
geom_histogram(aes(x = total_carb), binwidth = 10) +
xlab("total carbohydrates") +
ylab("frequency") +
ggtitle("Sonic Carbohydrates")
sonic_hist
scarbsmean = mean(sonic$total_carb)
scarbssd = sd(sonic$total_carb)
ggplot(data = sonic, aes(x = total_carb)) +
geom_blank() +
geom_histogram(aes(y = ..density..), bins = 10) +
stat_function(fun = dnorm, args = c(mean = scarbsmean, sd = scarbssd), col = "red") +
xlab("Total Carbohydrates")
ggtitle("Sonic Carbohydrates")
## $title
## [1] "Sonic Carbohydrates"
##
## attr(,"class")
## [1] "labels"