library(tidyverse)
library(openintro)

# head() prints the first six rows of fastfood; like glimpse(), it gives a
# quick look at the columns and their types.
head(fastfood, n = 6L)
## # A tibble: 6 x 17
##   restaurant item       calories cal_fat total_fat sat_fat trans_fat cholesterol
##   <chr>      <chr>         <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
## 1 Mcdonalds  Artisan G~      380      60         7       2       0            95
## 2 Mcdonalds  Single Ba~      840     410        45      17       1.5         130
## 3 Mcdonalds  Double Ba~     1130     600        67      27       3           220
## 4 Mcdonalds  Grilled B~      750     280        31      10       0.5         155
## 5 Mcdonalds  Crispy Ba~      920     410        45      12       0.5         120
## 6 Mcdonalds  Big Mac         540     250        28      10       1            80
## # ... with 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>,
## #   sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>,
## #   salad <chr>
#The data set has 17 variables, one per column. For this part, only the restaurant, calories, and calories from fat columns are used, and only for McDonalds and Dairy Queen.

# Keep one data frame per restaurant being compared. The restaurant names
# must match the spelling used in the fastfood data exactly.
mcdonalds <- filter(fastfood, restaurant == "Mcdonalds")
dairy_queen <- filter(fastfood, restaurant == "Dairy Queen")

Exercise 1

Make a plot (or plots) to visualize the distributions of the amount of calories from fat of the options from these two restaurants. How do their centers, shapes, and spreads compare?

Based on the histograms below, the centers of the two distributions are in similar places, a little over 200. The McDonalds histogram shows an extreme right skew; the Dairy Queen histogram is also right skewed, but not as strongly as the McDonalds one. The spread of the Dairy Queen data is tighter: it only goes from 0 to 700, while the McDonalds data goes from 0 to 1400.
# Visualize the distribution of calories from fat for each restaurant with a
# histogram (shape) and a horizontal boxplot (center, spread, outliers).

# McDonalds: histogram, then boxplot
hist(mcdonalds$cal_fat)

boxplot(mcdonalds$cal_fat, horizontal=TRUE) 

# Dairy Queen: histogram, then boxplot
hist(dairy_queen$cal_fat)

boxplot(dairy_queen$cal_fat, horizontal=TRUE) 

Exercise 2

Based on this plot, does it appear that the data follow a nearly normal distribution?

While the data points don’t follow the normal curve perfectly, the data does have a bell shape: the ends get closer to zero and there is a peak in the middle.
# Store the mean and standard deviation of Dairy Queen's calories from fat so
# the normal curves below can refer to them by name.
dqmean <- mean(dairy_queen$cal_fat)
dqsd <- sd(dairy_queen$cal_fat)

# Density-scaled histogram with a normal curve (same mean and sd as the data)
# overlaid in red, to judge whether the data look roughly normal.
# after_stat(density) replaces the deprecated ..density.. notation, and args
# is passed as a list, which is the form stat_function() documents.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density))) +
  stat_function(
    fun = dnorm,
    args = list(mean = dqmean, sd = dqsd),
    col = "tomato"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same plot with 20 bins instead of the default 30, for another look at the
# shape of the data; the normal curve is drawn in blue this time.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density)), bins = 20) +
  stat_function(
    fun = dnorm,
    args = list(mean = dqmean, sd = dqsd),
    col = "blue"
  )

Exercise 3

Make a normal probability plot of sim_norm. Do all of the points fall on the line? How does this plot compare to the probability plot for the real data? (Since sim_norm is not a dataframe, it can be put directly into the sample argument and the data argument can be dropped.)

Not all of the points fall on the line, and the plot for the simulated data is very similar to the plot for the real data. Comparing the two qqplots suggests that the real data could be relatively normal.
# Normal probability (Q-Q) plot of Dairy Queen's calories from fat. On its own
# it is hard to tell how close to normal the data really are.
ggplot(dairy_queen, aes(sample = cal_fat)) +
  geom_line(stat = "qq")

# Simulate a normal sample with the same size, mean, and sd as the Dairy Queen
# data, then draw its Q-Q plot as a reference for comparison.
sim_norm <- rnorm(nrow(dairy_queen), mean = dqmean, sd = dqsd)

ggplot(mapping = aes(sample = sim_norm)) +
  geom_line(stat = "qq")

Exercise 4

Does the normal probability plot for the calories from fat look similar to the plots created for the simulated data? That is, do the plots provide evidence that the calories from fat are nearly normal?

Comparing the two plots (real vs. simulated) shows that they are very similar for the most part. Since the simulated data is drawn from a normal distribution and its plot looks much like the plot of the actual data, the Dairy Queen calories from fat appear to be relatively normal.

Exercise 5

Using the same technique, determine whether or not the calories from McDonald’s menu appear to come from a normal distribution.

Comparing the qqplot of the McDonalds calories with the qqplot of the simulated data clearly shows that the McDonalds calories data is not relatively normal. The qqplot of the McDonalds calories shows a right skew in the data.
# Mean and sd of McDonalds calories, and a simulated normal sample of the same
# size built from them, to use as a reference.
mcmean <- mean(mcdonalds$calories)
mcsd <- sd(mcdonalds$calories)
sim1_norm <- rnorm(nrow(mcdonalds), mean = mcmean, sd = mcsd)

# Q-Q plot of the real McDonalds calories
ggplot(mcdonalds, aes(sample = calories)) +
  geom_line(stat = "qq")

# Q-Q plot of the simulated normal data, for comparison
ggplot(mapping = aes(sample = sim1_norm)) +
  geom_line(stat = "qq")

Exercise 6

Write out two probability questions that you would like to answer about any of the restaurants in this dataset. Calculate those probabilities using both the theoretical normal distribution as well as the empirical distribution (four probabilities in all). Which one had a closer agreement between the two methods?

In the first question, which asks for the probability that the sodium content is less than 1000, the results from pnorm and from the empirical method are fairly close to each other, at about 0.33 (0.336 compared to 0.333). In the second question, the numbers are still fairly close, at 0.114 compared to 0.119, which could indicate that this distribution is relatively normal.

Question 1

What is the probability that a McDonald’s menu item has sodium content less than 1000?

# Mean and standard deviation of sodium across McDonalds items
mean(mcdonalds$sodium)
## [1] 1437.895
sd(mcdonalds$sodium)
## [1] 1036.172
# Theoretical probability: area below 1000 under a normal curve with the mean
# and sd printed above
pnorm(q = 1000, mean = 1437.895, sd = 1036.172)
## [1] 0.3362905
# Empirical probability: number of McDonalds items with sodium below 1000,
# divided by the total number of McDonalds items
mcdonalds %>%
  filter(sodium < 1000) %>%
  summarise(percent = n() / nrow(mcdonalds))
## # A tibble: 1 x 1
##   percent
##     <dbl>
## 1   0.333

Question 2

What is the probability that an item in Dairy Queen has total fat above 50?

# Mean and standard deviation of total fat across Dairy Queen items
mean(dairy_queen$total_fat)
## [1] 28.85714
sd(dairy_queen$total_fat)
## [1] 17.51873
# Theoretical probability: area above 50 under a normal curve, i.e. one minus
# the area below 50
1 - pnorm(q = 50, mean = 28.857, sd = 17.518)
## [1] 0.1137295
# Empirical probability: number of Dairy Queen items with total fat above 50,
# divided by the total number of Dairy Queen items
dairy_queen %>%
  filter(total_fat > 50) %>%
  summarise(percent = n() / nrow(dairy_queen))
## # A tibble: 1 x 1
##   percent
##     <dbl>
## 1   0.119
# Theoretical probability that a Dairy Queen item has more than 600 calories
# from fat. Subtracting from 1 gives the area to the right of 600.
1 - pnorm(600, mean = dqmean, sd = dqsd)
## [1] 0.01501523
# Empirical version: count the items above 600 and divide by the sample size.
# It takes more code than a single pnorm() call.

# The two answers are not reasonably close: about 1.5% (theoretical) compared
# to about 4.8% (empirical).

dairy_queen %>%
  filter(cal_fat > 600) %>%
  summarise(percent = n() / nrow(dairy_queen))
## # A tibble: 1 x 1
##   percent
##     <dbl>
## 1  0.0476

Exercise 7

Now let’s consider some of the other variables in the dataset. Out of all the different restaurants, which ones’ distribution is the closest to normal for sodium?

Burger King seems to have the closest to normal for sodium.

Insert any text here.

# Build one data frame per remaining restaurant by filtering fastfood on the
# restaurant name (mcdonalds and dairy_queen already exist).
arbys <- filter(fastfood, restaurant == "Arbys")
burger_king <- filter(fastfood, restaurant == "Burger King")
taco_bell <- filter(fastfood, restaurant == "Taco Bell")
subway <- filter(fastfood, restaurant == "Subway")
sonic <- filter(fastfood, restaurant == "Sonic")
chickfila <- filter(fastfood, restaurant == "Chick Fil-A")

# Draw a normal Q-Q plot of sodium for every restaurant to see which one looks
# the most normal. Plots are drawn in list order, and each list name is used
# as the plot title.
iwalk(
  list(
    "McDonalds" = mcdonalds$sodium,
    "Dairy Queen" = dairy_queen$sodium,
    "Arbys" = arbys$sodium,
    "Burger King" = burger_king$sodium,
    "Taco Bell" = taco_bell$sodium,
    "Subway" = subway$sodium,
    "Sonic" = sonic$sodium,
    "Chick Fil-A" = chickfila$sodium
  ),
  function(sodium, title) qqnorm(sodium, main = title)
)

Exercise 8

Note that some of the normal probability plots for sodium distributions seem to have a stepwise pattern. Why do you think this might be the case?

Some of the plots have a stepwise pattern, which could be caused by the diverse menu choices at these places, from fries to drinks to burgers.

Exercise 9

As you can see, normal probability plots can be used both to assess normality and visualize skewness. Make a normal probability plot for the total carbohydrates from a restaurant of your choice. Based on this normal probability plot, is this variable left skewed, symmetric, or right skewed? Use a histogram to confirm your findings.

Based on the normal probability plot, the variable could be symmetric, because nothing really shoots up or down that much. In the histogram, most of the bars are at around the same frequency, but on the right side the frequency is much lower.
# Use Taco Bell for total carbohydrates: a normal probability plot to assess
# skewness, followed by a histogram to confirm the finding.
qqnorm(taco_bell$total_carb, main="Taco Bell")

hist(taco_bell$total_carb)

