library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(ggplot2)
library(patchwork)

Exercise 1 Make a plot (or plots) to visualize the distributions of the amount of calories from fat of the options from these two restaurants. How do their centers, shapes, and spreads compare?

# Copy the openintro `fastfood` data into the workspace and preview its first rows.
fastfood <- openintro::fastfood
head(fastfood, n = 6)
## # A tibble: 6 x 17
##   restaurant item  calories cal_fat total_fat sat_fat trans_fat cholesterol
##   <chr>      <chr>    <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
## 1 Mcdonalds  Arti…      380      60         7       2       0            95
## 2 Mcdonalds  Sing…      840     410        45      17       1.5         130
## 3 Mcdonalds  Doub…     1130     600        67      27       3           220
## 4 Mcdonalds  Gril…      750     280        31      10       0.5         155
## 5 Mcdonalds  Cris…      920     410        45      12       0.5         120
## 6 Mcdonalds  Big …      540     250        28      10       1            80
## # … with 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>,
## #   sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>,
## #   salad <chr>

# Split the menu data into one tibble per restaurant of interest.

mcdonalds <- filter(fastfood, restaurant == "Mcdonalds")
dairy_queen <- filter(fastfood, restaurant == "Dairy Queen")

# Five-number summary (plus mean) of calories from fat for each restaurant.

mcsummary <- summary(mcdonalds[["cal_fat"]])
dqsummary <- summary(dairy_queen[["cal_fat"]])
print(mcsummary)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    50.0   160.0   240.0   285.6   320.0  1270.0
print(dqsummary)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   160.0   220.0   260.5   310.0   670.0

# Mean and standard deviation of calories from fat; the plots and pnorm()
# calls further down reuse these four values.

mcmean <- mean(mcdonalds[["cal_fat"]])
mcsd <- sd(mcdonalds[["cal_fat"]])
print(mcmean)
## [1] 285.614
print(mcsd)
## [1] 220.8993
dqmean <- mean(dairy_queen[["cal_fat"]])
dqsd <- sd(dairy_queen[["cal_fat"]])
print(dqmean)
## [1] 260.4762
print(dqsd)
## [1] 156.4851

# Horizontal boxplots of calories from fat, one per restaurant, with the
# sample mean drawn as a thick red vertical line.

mcboxplot <- ggplot(mcdonalds, aes(x = cal_fat)) +
  geom_boxplot() +
  labs(title = "distribution fat calories mcdonalds") +
  geom_vline(xintercept = mcmean, colour = "red", lwd = 2)

dqboxplot <- ggplot(dairy_queen, aes(x = cal_fat)) +
  geom_boxplot() +
  labs(title = "distribution fat calories dairy_queen") +
  geom_vline(xintercept = dqmean, colour = "red", lwd = 2)

# Density-scaled histogram of Dairy Queen calories from fat, overlaid with the
# normal curve for the sample mean/sd and a red line at the mean.

dqhistogram <- ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(bins = 10, aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = c(mean = dqmean, sd = dqsd),
    colour = "tomato"
  ) +
  labs(title = "distribution fat calories dairy_queen") +
  geom_vline(xintercept = dqmean, colour = "red")

# Same plot for McDonald's.

mchistogram <- ggplot(data = mcdonalds, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(bins = 10, aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = c(mean = mcmean, sd = mcsd),
    colour = "tomato"
  ) +
  labs(title = "distribution of calories fat mcdonalds") +
  geom_vline(xintercept = mcmean, colour = "red")

# Show the two boxplots side by side, then the two histograms (patchwork `+`).
mcboxplot + dqboxplot

mchistogram + dqhistogram

### Answer Exercise 1 - How do their centers, shapes, and spreads compare?
# Comparing the boxplots and histograms of cal_fat for Dairy Queen and McDonald's, we note the following:
# - Both datasets have several outliers on the boxplots.
# - The mean calories from fat is greater for McDonald's than for Dairy Queen.
# - The spread of calories from fat is wider for McDonald's than for Dairy Queen.
# - Both distributions are skewed to the right, which may reflect the high outliers seen in the boxplots.

Exercise 2 Based on this plot, does it appear that the data follow a nearly normal distribution?

# Normal probability (Q-Q) plots of calories from fat for both restaurants:
# connected points plus the reference line.

mcQQ <- ggplot(data = mcdonalds, aes(sample = cal_fat)) +
  geom_line(stat = "qq") +
  geom_qq() +
  geom_qq_line() +
  labs(title = "normal prob plot fat-cal mcdonalds")

dqQQ <- ggplot(data = dairy_queen, aes(sample = cal_fat)) +
  geom_line(stat = "qq") +
  geom_qq() +
  geom_qq_line() +
  labs(title = "normal prob plot fat-cal dairy_queen")

mcQQ + dqQQ

#Answer Exercise 2 - The normal probability plots above do not suggest that either data set is normally distributed. Rather, they confirm the right skew noted in the histograms from Exercise 1.

Exercise 3 Make a normal probability plot of sim_norm . Do all of the points fall on the line? How does this plot compare to the probability plot for the real data?

#(Since sim_norm is not a dataframe, it can be put directly into the sample argument and the data argument can be dropped.)

# Draw a simulated normal sample with Dairy Queen's sample size, mean, and sd.

dqsim_norm <- rnorm(nrow(dairy_queen), mean = dqmean, sd = dqsd)

# Show the Q-Q plot of the observed cal_fat data next to Q-Q plots of simulated
# normal samples. qqnormsim() takes the data directly; it does not read dqsim_norm.

qqnormsim(data = dairy_queen, sample = cal_fat)

### Exercise 4 Does the normal probability plot for the calories from fat look similar to the plots created for the simulated data? That is, do the plots provide evidence that dairy_queen cal-fat data are nearly normal?

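#ANS EX 4 The normal probability plot for the Dairy Queen cal_fat data does not look similar to the normal probability plots of the simulated (dqsim_norm) data. The simulated-data plots appear normally distributed, so the plots do not provide evidence that the Dairy Queen cal_fat data are nearly normal.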

Exercise 5 Using the same technique, determine whether or not the calories from McDonald’s menu appear to come from a normal distribution.

# Draw a simulated normal sample with McDonald's sample size, mean, and sd.
# The parameters must be McDonald's own (mcmean/mcsd), not Dairy Queen's, so
# the simulation describes the restaurant it is compared with.

mcsim_norm <- rnorm(n = nrow(mcdonalds), mean = mcmean, sd = mcsd)

# Show the Q-Q plot of the observed McDonald's cal_fat data next to Q-Q plots
# of simulated normal samples.

qqnormsim(sample = cal_fat, data = mcdonalds)

# Answer Exercise 5 - Similar to Exercise 4, the normal probability plot for the McDonald's cal_fat data and the plots for the simulated normal data are dissimilar, so the McDonald's data do not appear to come from a normal distribution.

# Normal probabilities
# Theoretical P(Y > 600) for Dairy Queen cal_fat under a normal model, in percent.

100 * (1 - pnorm(q = 600, mean = dqmean, sd = dqsd))
## [1] 1.501523

# Empirical share of Dairy Queen items with cal_fat > 600.
# The column is named `percent` but holds a proportion between 0 and 1.

summarise(
  filter(dairy_queen, cal_fat > 600),
  percent = n() / nrow(dairy_queen)
)
## # A tibble: 1 x 1
##   percent
##     <dbl>
## 1  0.0476

Exercise 6 Write out two probability questions that you would like to answer about any of the restaurants in this dataset. Calculate those probabilities using both the theoretical normal distribution as well as the empirical distribution (four probabilities in all). Which one had a closer agreement between the two methods?

# Theoretical P(Y < 600) for Dairy Queen cal_fat under a normal model, in percent.
# Uses the computed dqmean/dqsd instead of hand-typed, rounded copies, so the
# result is the exact complement of the P(Y > 600) value computed earlier.

pnorm(600, mean = dqmean, sd = dqsd) * 100
## [1] 98.49848

#interpretation: under the normal model, an item drawn at random from the dairy_queen menu has about a 98.5% chance of having fewer than 600 calories from fat

# Empirical P(Y < 600): share of Dairy Queen items with cal_fat below 600
# (reported as a proportion between 0 and 1).

summarise(
  filter(dairy_queen, cal_fat < 600),
  percent = n() / nrow(dairy_queen)
)
## # A tibble: 1 x 1
##   percent
##     <dbl>
## 1   0.952

#interpretation: 95.2% of data in dairy_queen cal_fat dataset are < 600 calories

# Theoretical P(Y < 600) for McDonald's cal_fat under a normal model, in percent.

100 * pnorm(q = 600, mean = mcmean, sd = mcsd)
## [1] 92.26623

#interpretation: under the normal model, an item drawn at random from the mcdonalds menu has about a 92.3% chance of having fewer than 600 calories from fat

# Empirical P(Y < 600): share of McDonald's items with cal_fat below 600
# (reported as a proportion between 0 and 1).

summarise(
  filter(mcdonalds, cal_fat < 600),
  percent = n() / nrow(mcdonalds)
)
## # A tibble: 1 x 1
##   percent
##     <dbl>
## 1   0.895

#interpretation: 89.5% of items in the mcdonalds cal_fat dataset have < 600 calories from fat

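#the theoretical and empirical calculations agreed more closely for the mcdonalds cal_fat data (92.3% vs 89.5%, a gap of about 2.8 points) than for the dairy_queen data (98.5% vs 95.2%, a gap of about 3.3 points).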

Exercise 7 Now let’s consider some of the other variables in the dataset. Out of all the different restaurants, which ones’ distribution is the closest to normal for sodium?

# Mean and standard deviation of sodium for McDonald's.
mcmean_sodium <- mean(mcdonalds$sodium)
mcsd_sodium <- sd(mcdonalds$sodium)
print(mcmean_sodium)
## [1] 1437.895
print(mcsd_sodium)
## [1] 1036.172

# Density-scaled sodium histogram. The normal overlay must use the sodium
# mean/sd; the cal_fat statistics (mcmean/mcsd) are on a different scale and
# would draw a curve unrelated to this histogram.
mchist_sodium <- ggplot(data = mcdonalds, aes(x = sodium)) +
  geom_blank() +
  geom_histogram(bins = 10, aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = c(mean = mcmean_sodium, sd = mcsd_sodium),
    col = "tomato"
  ) +
  ggtitle("distribution of sodium mcdonalds") +
  geom_vline(xintercept = mcmean_sodium, col = "red")

# Normal probability plot of sodium. This reassigns mcQQ, which previously
# held the cal_fat plot.
mcQQ <- ggplot(data = mcdonalds, aes(sample = sodium)) +
  geom_line(stat = "qq") +
  stat_qq() +
  stat_qq_line() +
  ggtitle("normal prob plot sodium mcdonalds")

mchist_sodium + mcQQ

#interpretation: the histogram and normal probability plot for McDonald's sodium do not appear normally distributed. The data are heavily skewed to the right.

# Now analyze the Dairy Queen sodium data for normality.

# Mean and standard deviation of sodium for Dairy Queen. Print the Dairy Queen
# values here, not the McDonald's ones.
dqmean_sodium <- mean(dairy_queen$sodium)
dqsd_sodium <- sd(dairy_queen$sodium)
print(dqmean_sodium)
print(dqsd_sodium)
## (output not shown: re-knit to print the Dairy Queen values)

# Density-scaled sodium histogram. The normal overlay must use the sodium
# mean/sd; the cal_fat statistics (dqmean/dqsd) are on a different scale and
# would draw a curve unrelated to this histogram.
dqhist_sodium <- ggplot(data = dairy_queen, aes(x = sodium)) +
  geom_blank() +
  geom_histogram(bins = 10, aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = c(mean = dqmean_sodium, sd = dqsd_sodium),
    col = "tomato"
  ) +
  ggtitle("distribution of sodium dairy_queen") +
  geom_vline(xintercept = dqmean_sodium, col = "red")

# Normal probability plot of sodium. This reassigns dqQQ, which previously
# held the cal_fat plot.
dqQQ <- ggplot(data = dairy_queen, aes(sample = sodium)) +
  geom_line(stat = "qq") +
  stat_qq() +
  stat_qq_line() +
  ggtitle("normal prob plot sodium dairy_queen")

dqhist_sodium + dqQQ

#interpretation: the histogram and normal probability plot for Dairy Queen sodium do not appear normally distributed. The data are heavily skewed to the right.

Exercise 7 Now let’s consider some of the other variables in the dataset. Out of all the different restaurants, which ones’ distribution is the closest to normal for sodium?

library(tidyverse)
library(openintro)
library(ggplot2)
library(patchwork)
# One tibble per restaurant, for the sodium comparison across all chains.
# Each restaurant is defined exactly once.
mcdonalds <- fastfood %>%
  filter(restaurant == "Mcdonalds")
dairy_queen <- fastfood %>%
  filter(restaurant == "Dairy Queen")
Subway <- fastfood %>%
  filter(restaurant == "Subway")
Sonic <- fastfood %>%
  filter(restaurant == "Sonic")
Burger_King <- fastfood %>%
  filter(restaurant == "Burger King")
Arbys <- fastfood %>%
  filter(restaurant == "Arbys")
Chick_Fil_A <- fastfood %>%
  filter(restaurant == "Chick Fil-A")
TacoBell <- fastfood %>%
  filter(restaurant == "Taco Bell")
# Build a density-scaled sodium histogram for one restaurant, overlaid with
# the normal curve for that restaurant's own mean/sd and a red vertical line
# at its mean. Routing every plot through one helper keeps each restaurant's
# curve and mean line tied to that restaurant's statistics.
#   data:        tibble of one restaurant's menu items (must have `sodium`)
#   mean_sodium: mean of data$sodium
#   sd_sodium:   standard deviation of data$sodium
#   title:       plot title
# Returns the ggplot object.
sodium_hist <- function(data, mean_sodium, sd_sodium, title) {
  ggplot(data = data, aes(x = sodium)) +
    geom_blank() +
    geom_histogram(bins = 10, aes(y = ..density..)) +
    stat_function(
      fun = dnorm,
      args = list(mean = mean_sodium, sd = sd_sodium),
      col = "tomato"
    ) +
    ggtitle(title) +
    geom_vline(xintercept = mean_sodium, col = "red")
}

# Per-restaurant sodium mean/sd and histogram. The sodium statistics get their
# own *_sodium names so the cal_fat values in mcmean/dqmean stay untouched.
dqmean_sodium <- mean(dairy_queen$sodium)
dqsd_sodium <- sd(dairy_queen$sodium)
dqhist_sodium <- sodium_hist(
  dairy_queen, dqmean_sodium, dqsd_sodium,
  "distribution of sodium dairy_queen"
)

mcmean_sodium <- mean(mcdonalds$sodium)
mcsd_sodium <- sd(mcdonalds$sodium)
mchist_sodium <- sodium_hist(
  mcdonalds, mcmean_sodium, mcsd_sodium,
  "distribution of sodium mcdonalds"
)

sbmean_sodium <- mean(Subway$sodium)
sbsd_sodium <- sd(Subway$sodium)
sbhist_sodium <- sodium_hist(
  Subway, sbmean_sodium, sbsd_sodium,
  "distribution of sodium Subway"
)

bkmean_sodium <- mean(Burger_King$sodium)
bksd_sodium <- sd(Burger_King$sodium)
bkhist_sodium <- sodium_hist(
  Burger_King, bkmean_sodium, bksd_sodium,
  "distribution of sodium Burger king"
)

tbmean_sodium <- mean(TacoBell$sodium)
tbsd_sodium <- sd(TacoBell$sodium)
tbhist_sodium <- sodium_hist(
  TacoBell, tbmean_sodium, tbsd_sodium,
  "distribution of sodium Taco Bell"
)

abmean_sodium <- mean(Arbys$sodium)
absd_sodium <- sd(Arbys$sodium)
abhist_sodium <- sodium_hist(
  Arbys, abmean_sodium, absd_sodium,
  "distribution of sodium Arby's"
)

snmean_sodium <- mean(Sonic$sodium)
snsd_sodium <- sd(Sonic$sodium)
snhist_sodium <- sodium_hist(
  Sonic, snmean_sodium, snsd_sodium,
  "distribution of sodium Sonic"
)

cfmean_sodium <- mean(Chick_Fil_A$sodium)
cfsd_sodium <- sd(Chick_Fil_A$sodium)
cfhist_sodium <- sodium_hist(
  Chick_Fil_A, cfmean_sodium, cfsd_sodium,
  "distribution of sodium ChickFil"
)

# Show all eight histograms in one patchwork layout.
dqhist_sodium + mchist_sodium + sbhist_sodium + bkhist_sodium + tbhist_sodium +
  abhist_sodium + snhist_sodium + cfhist_sodium

# Normal probability (Q-Q) plots of sodium, one per restaurant: connected
# points plus the reference line.
mcQQ_sodium <- ggplot(data = mcdonalds, aes(sample = sodium)) +
  geom_line(stat = "qq") +
  geom_qq() +
  geom_qq_line() +
  labs(title = "normal prob plot sodium mcdonalds")

dqQQ_sodium <- ggplot(data = dairy_queen, aes(sample = sodium)) +
  geom_line(stat = "qq") +
  geom_qq() +
  geom_qq_line() +
  labs(title = "normal prob plot sodium dairy queen")

sbQQ_sodium <- ggplot(data = Subway, aes(sample = sodium)) +
  geom_line(stat = "qq") +
  geom_qq() +
  geom_qq_line() +
  labs(title = "normal prob plot sodium Subway")

bkQQ_sodium <- ggplot(data = Burger_King, aes(sample = sodium)) +
  geom_line(stat = "qq") +
  geom_qq() +
  geom_qq_line() +
  labs(title = "normal prob plot sodium burger king")

tbQQ_sodium <- ggplot(data = TacoBell, aes(sample = sodium)) +
  geom_line(stat = "qq") +
  geom_qq() +
  geom_qq_line() +
  labs(title = "normal prob plot sodium taco bell")

abQQ_sodium <- ggplot(data = Arbys, aes(sample = sodium)) +
  geom_line(stat = "qq") +
  geom_qq() +
  geom_qq_line() +
  labs(title = "normal prob plot sodium Arby's")

snQQ_sodium <- ggplot(data = Sonic, aes(sample = sodium)) +
  geom_line(stat = "qq") +
  geom_qq() +
  geom_qq_line() +
  labs(title = "normal prob sodium Sonic")

ckQQ_sodium <- ggplot(data = Chick_Fil_A, aes(sample = sodium)) +
  geom_line(stat = "qq") +
  geom_qq() +
  geom_qq_line() +
  labs(title = "normal prob plot sodium Chick Fil-A")

# Show all eight Q-Q plots in one patchwork layout.
dqQQ_sodium + mcQQ_sodium + sbQQ_sodium + bkQQ_sodium +
  tbQQ_sodium + abQQ_sodium + snQQ_sodium + ckQQ_sodium

#from QQ plots analysis, distribution for sodium from Burger king, Taco bell, and Arby’s appear most normally distributed.

# Shapiro-Wilk normality test on the observed sodium values of the three
# restaurants whose Q-Q plots looked closest to normal.
# The test must be given the real data: rnorm() draws are normal by
# construction, so testing them says nothing about the menus.
bksharpiro <- shapiro.test(Burger_King$sodium)
tbsharpiro <- shapiro.test(TacoBell$sodium)
absharpiro <- shapiro.test(Arbys$sodium)
print(bksharpiro)
print(tbsharpiro)
print(absharpiro)
## (output not shown: re-knit to print the results for the observed data)

# NOTE(review): the earlier conclusion (all p-values > 0.1, hence no compelling
# evidence of non-normality) was based on simulated normal samples. Re-check it
# against the p-values printed above for the observed sodium data.

Exercise 8 Note that some of the normal probability plots for sodium distributions seem to have a stepwise pattern. why do you think this might be the case?

#Answer - the stepwise pattern in the normal prob plots may be due to outliers in the data near the right side of the plots

Exercise 9 As you can see, normal probability plots can be used both to assess normality and visualize skewness. Make a normal probability plot for the total carbohydrates from a restaurant of your choice. Based on this normal probability plot, is this variable left skewed, symmetric, or right skewed? Use a histogram to confirm your findings.

# Assess the McDonald's total_carb data for normality vs. skewness.
# Mean and standard deviation of total_carb.

mcmean_carb <- mean(mcdonalds$total_carb)
mcsd_carb <- sd(mcdonalds$total_carb)
print(mcmean_carb)
## [1] 48.78947
print(mcsd_carb)
## [1] 26.44248

# Density-scaled histogram of total_carb. The normal overlay and the mean line
# must use the total_carb statistics computed above; Dairy Queen's cal_fat or
# sodium values are on a different scale and belong to a different restaurant.

mchist_carb <- ggplot(data = mcdonalds, aes(x = total_carb)) +
  geom_blank() +
  geom_histogram(bins = 10, aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = c(mean = mcmean_carb, sd = mcsd_carb),
    col = "tomato"
  ) +
  ggtitle("distribution of total_carb mcdonalds") +
  geom_vline(xintercept = mcmean_carb, col = "red")

# Normal probability plot of total_carb.

mcQQ_carb <- ggplot(data = mcdonalds, aes(sample = total_carb)) +
  geom_line(stat = "qq") +
  stat_qq() +
  stat_qq_line() +
  ggtitle("normal prob plot total_carb mcdonalds")

mchist_carb + mcQQ_carb

#interpretation - similar to all prior data sets analyzed, the mcdonalds total_carb data is heavily skewed to the right.

