The normal distribution

R Packages

library(tidyverse) #loading all library needed for this assignment
library(openintro)
head(fastfood)

## # A tibble: 6 x 17
##   restaurant item  calories cal_fat total_fat sat_fat trans_fat cholesterol
##   <chr>      <chr>    <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
## 1 Mcdonalds  Arti~      380      60         7       2       0            95
## 2 Mcdonalds  Sing~      840     410        45      17       1.5         130
## 3 Mcdonalds  Doub~     1130     600        67      27       3           220
## 4 Mcdonalds  Gril~      750     280        31      10       0.5         155
## 5 Mcdonalds  Cris~      920     410        45      12       0.5         120
## 6 Mcdonalds  Big ~      540     250        28      10       1            80
## # ... with 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>,
## #   sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>,
## #   salad <chr>

library(readxl)
library(readr)
library(plyr)
library(dplyr)
library(dice)
# #library(VennDiagram)
# #library(help = "dice")
# library(DBI)
# library(dbplyr)
# library(data.table)
# library(rstudioapi)
# library(RJDBC)
# library(odbc)
# library(RSQLite)
# library(rvest)
# library(stringr)
# library(readtext)
# library(ggpubr)
library(fitdistrplus)
library(ggplot2)
library(moments)
library(qualityTools)
library(normalp)
library(utils)
library(MASS)
library(qqplotr)
library(DATA606)

## 
## Welcome to CUNY DATA606 Statistics and Probability for Data Analytics 
## This package is designed to support this course. The text book used 
## is OpenIntro Statistics, 3rd Edition. You can read this by typing 
## vignette('os3') or visit www.OpenIntro.org. 
##  
## The getLabs() function will return a list of the labs available. 
##  
## The demo(package='DATA606') will list the demos that are available.

Exercise 1

Make a plot (or plots) to visualize the distributions of the amount of calories from fat of the options from these two restaurants. How do their centers, shapes, and spreads compare? Answer: For Dairy Queen, the distribution is centered near the mean (about 260 calories from fat), with the shorter tail on the left. There is one mode, around [100, 200], which lies to the left of the mean. Thus the distribution is unimodal (although it looks somewhat multimodal) and right skewed; a smooth curve drawn over the histogram would also look right skewed. The spread runs from 0 (minimum fat calories) to 670 (maximum fat calories), so the range is 670 - 0 = 670.

fastfood

## # A tibble: 515 x 17
##    restaurant item  calories cal_fat total_fat sat_fat trans_fat cholesterol
##    <chr>      <chr>    <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
##  1 Mcdonalds  Arti~      380      60         7       2       0            95
##  2 Mcdonalds  Sing~      840     410        45      17       1.5         130
##  3 Mcdonalds  Doub~     1130     600        67      27       3           220
##  4 Mcdonalds  Gril~      750     280        31      10       0.5         155
##  5 Mcdonalds  Cris~      920     410        45      12       0.5         120
##  6 Mcdonalds  Big ~      540     250        28      10       1            80
##  7 Mcdonalds  Chee~      300     100        12       5       0.5          40
##  8 Mcdonalds  Clas~      510     210        24       4       0            65
##  9 Mcdonalds  Doub~      430     190        21      11       1            85
## 10 Mcdonalds  Doub~      770     400        45      21       2.5         175
## # ... with 505 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## #   fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## #   calcium <dbl>, salad <chr>

view(fastfood)
summary(fastfood)

##   restaurant            item              calories         cal_fat      
##  Length:515         Length:515         Min.   :  20.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.: 330.0   1st Qu.: 120.0  
##  Mode  :character   Mode  :character   Median : 490.0   Median : 210.0  
##                                        Mean   : 530.9   Mean   : 238.8  
##                                        3rd Qu.: 690.0   3rd Qu.: 310.0  
##                                        Max.   :2430.0   Max.   :1270.0  
##                                                                         
##    total_fat         sat_fat         trans_fat      cholesterol    
##  Min.   :  0.00   Min.   : 0.000   Min.   :0.000   Min.   :  0.00  
##  1st Qu.: 14.00   1st Qu.: 4.000   1st Qu.:0.000   1st Qu.: 35.00  
##  Median : 23.00   Median : 7.000   Median :0.000   Median : 60.00  
##  Mean   : 26.59   Mean   : 8.153   Mean   :0.465   Mean   : 72.46  
##  3rd Qu.: 35.00   3rd Qu.:11.000   3rd Qu.:1.000   3rd Qu.: 95.00  
##  Max.   :141.00   Max.   :47.000   Max.   :8.000   Max.   :805.00  
##                                                                    
##      sodium       total_carb         fiber            sugar       
##  Min.   :  15   Min.   :  0.00   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 800   1st Qu.: 28.50   1st Qu.: 2.000   1st Qu.: 3.000  
##  Median :1110   Median : 44.00   Median : 3.000   Median : 6.000  
##  Mean   :1247   Mean   : 45.66   Mean   : 4.137   Mean   : 7.262  
##  3rd Qu.:1550   3rd Qu.: 57.00   3rd Qu.: 5.000   3rd Qu.: 9.000  
##  Max.   :6080   Max.   :156.00   Max.   :17.000   Max.   :87.000  
##                                  NA's   :12                       
##     protein           vit_a            vit_c           calcium      
##  Min.   :  1.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.: 16.00   1st Qu.:  4.00   1st Qu.:  4.00   1st Qu.:  8.00  
##  Median : 24.50   Median : 10.00   Median : 10.00   Median : 20.00  
##  Mean   : 27.89   Mean   : 18.86   Mean   : 20.17   Mean   : 24.85  
##  3rd Qu.: 36.00   3rd Qu.: 20.00   3rd Qu.: 30.00   3rd Qu.: 30.00  
##  Max.   :186.00   Max.   :180.00   Max.   :400.00   Max.   :290.00  
##  NA's   :1        NA's   :214      NA's   :210      NA's   :210     
##     salad          
##  Length:515        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

# Subset of the menu data containing only the McDonald's items
mcdonalds <- filter(fastfood, restaurant == "Mcdonalds")
mcdonalds

## # A tibble: 57 x 17
##    restaurant item  calories cal_fat total_fat sat_fat trans_fat cholesterol
##    <chr>      <chr>    <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
##  1 Mcdonalds  Arti~      380      60         7       2       0            95
##  2 Mcdonalds  Sing~      840     410        45      17       1.5         130
##  3 Mcdonalds  Doub~     1130     600        67      27       3           220
##  4 Mcdonalds  Gril~      750     280        31      10       0.5         155
##  5 Mcdonalds  Cris~      920     410        45      12       0.5         120
##  6 Mcdonalds  Big ~      540     250        28      10       1            80
##  7 Mcdonalds  Chee~      300     100        12       5       0.5          40
##  8 Mcdonalds  Clas~      510     210        24       4       0            65
##  9 Mcdonalds  Doub~      430     190        21      11       1            85
## 10 Mcdonalds  Doub~      770     400        45      21       2.5         175
## # ... with 47 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## #   fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## #   calcium <dbl>, salad <chr>

view(mcdonalds)

# Subset of the menu data containing only the Dairy Queen items
dairy_queen <- filter(fastfood, restaurant == "Dairy Queen")
dairy_queen

## # A tibble: 42 x 17
##    restaurant item  calories cal_fat total_fat sat_fat trans_fat cholesterol
##    <chr>      <chr>    <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
##  1 Dairy Que~ 1/2 ~     1000     660        74      26         2         170
##  2 Dairy Que~ 1/2 ~      800     460        51      20         2         135
##  3 Dairy Que~ 1/4 ~      630     330        37      13         1          95
##  4 Dairy Que~ 1/4 ~      540     270        30      11         1          70
##  5 Dairy Que~ 1/4 ~      570     310        35      11         1          75
##  6 Dairy Que~ Orig~      400     160        18       9         1          65
##  7 Dairy Que~ Orig~      630     310        34      18         2         125
##  8 Dairy Que~ 4 Pi~     1030     480        53       9         1          80
##  9 Dairy Que~ 6 Pi~     1260     590        66      11         1         120
## 10 Dairy Que~ Baco~      420     240        26      11         1          60
## # ... with 32 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## #   fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## #   calcium <dbl>, salad <chr>

view(dairy_queen)
summary(dairy_queen)

##   restaurant            item              calories         cal_fat     
##  Length:42          Length:42          Min.   :  20.0   Min.   :  0.0  
##  Class :character   Class :character   1st Qu.: 350.0   1st Qu.:160.0  
##  Mode  :character   Mode  :character   Median : 485.0   Median :220.0  
##                                        Mean   : 520.2   Mean   :260.5  
##                                        3rd Qu.: 630.0   3rd Qu.:310.0  
##                                        Max.   :1260.0   Max.   :670.0  
##                                                                        
##    total_fat        sat_fat        trans_fat       cholesterol    
##  Min.   : 0.00   Min.   : 0.00   Min.   :0.0000   Min.   :  0.00  
##  1st Qu.:18.00   1st Qu.: 5.00   1st Qu.:0.0000   1st Qu.: 41.25  
##  Median :24.50   Median : 9.00   Median :1.0000   Median : 60.00  
##  Mean   :28.86   Mean   :10.44   Mean   :0.6786   Mean   : 71.55  
##  3rd Qu.:34.75   3rd Qu.:12.50   3rd Qu.:1.0000   3rd Qu.:100.00  
##  Max.   :75.00   Max.   :43.00   Max.   :2.0000   Max.   :180.00  
##                                                                   
##      sodium         total_carb         fiber            sugar       
##  Min.   :  15.0   Min.   :  0.00   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 847.5   1st Qu.: 25.25   1st Qu.: 1.000   1st Qu.: 3.000  
##  Median :1030.0   Median : 34.00   Median : 2.000   Median : 6.000  
##  Mean   :1181.8   Mean   : 38.69   Mean   : 2.833   Mean   : 6.357  
##  3rd Qu.:1362.5   3rd Qu.: 44.75   3rd Qu.: 3.000   3rd Qu.: 8.750  
##  Max.   :3500.0   Max.   :121.00   Max.   :12.000   Max.   :30.000  
##                                                                     
##     protein          vit_a        vit_c          calcium      
##  Min.   : 1.00   Min.   : 0   Min.   : 0.00   Min.   :  0.00  
##  1st Qu.:17.00   1st Qu.: 9   1st Qu.: 0.00   1st Qu.:  6.00  
##  Median :23.00   Median :10   Median : 4.00   Median : 10.00  
##  Mean   :24.83   Mean   :14   Mean   : 4.37   Mean   : 16.41  
##  3rd Qu.:34.00   3rd Qu.:20   3rd Qu.: 6.00   3rd Qu.: 20.00  
##  Max.   :49.00   Max.   :50   Max.   :30.00   Max.   :100.00  
##                  NA's   :15   NA's   :15      NA's   :15      
##     salad          
##  Length:42         
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

# making plot : Freq histogram

calories_Fat <- dairy_queen$cal_fat
summary(calories_Fat)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   160.0   220.0   260.5   310.0   670.0

calories_Fat

##  [1] 660 460 330 270 310 160 310 480 590 240 220 220 180 160 180  80  80 410 670
## [20] 440 140 200 160 310 240 130 430 310 430 180 180 220 280 120 270 190 170  20
## [39] 140 130   0 240

hist(calories_Fat)

# line fit distribution
qqnorm(calories_Fat); qqline(calories_Fat)

# Statistic parameters for the dairy queen, mean, standard deviation
dqmean <- mean(dairy_queen$cal_fat)
dqsd   <- sd(dairy_queen$cal_fat)
dqmean

## [1] 260.4762

dqsd

## [1] 156.4851

# Density-scaled histogram of Dairy Queen calories from fat (freq = FALSE puts
# the y-axis on the density scale so a density curve can be drawn on top).
hist(
  calories_Fat,
  freq = FALSE,
  col = "blue",
  xlim = c(0, 700),
  xlab = "Fat calories",
  main = "Distributions of the amount of calories from fat "
)
# Overlay the normal density that uses the sample mean and standard deviation;
# lwd sets the line thickness. Compare the curve with the bars to judge the fit.
curve(dnorm(x, mean = dqmean, sd = dqsd), col = "red", lwd = 2, add = TRUE)

#Find max, min for histogram plot with details
is.na(calories_Fat) # checking if there is a missing data in the dataset, return false = no 'NA'

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE

sum(is.na(calories_Fat)) # file to big, checking the sum of all missing data , return = 0 , meaning no 'NA'

## [1] 0

max(calories_Fat) #summary actually gives this info

## [1] 670

min(calories_Fat)

## [1] 0

# Using a density histogram allows us to properly overlay a normal distribution curve over the histogram since the curve is a normal probability density function that also has area under the curve of 1. Frequency and density histograms both display the same exact shape; they only differ in their y-axis. You can verify this by comparing the frequency histogram you constructed earlier and the density histogram created by the commands below.

ggplot(data = dairy_queen, aes(x = cal_fat)) +
        geom_blank() +
        geom_histogram(aes(y = ..density..)) +
        stat_function(fun = dnorm, args = c(mean = dqmean, sd = dqsd), col = "tomato")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Exercise 2

Based on this plot, does it appear that the data follow a nearly normal distribution? Answer: The normal Q-Q plot shows the points adhering to the line, so the data appear to follow a nearly normal distribution.

# better fit curves
hist(calories_Fat, probability = TRUE)
lines(density(calories_Fat, cut = 0), col = "Red", lwd = 2)

# Eyeballing the shape of the histogram is one way to determine if the data appear to be nearly normally distributed, but it can be frustrating to decide just how close the histogram is to the curve. An alternative approach involves constructing a normal probability plot, also called a normal Q-Q plot for “quantile-quantile”.

ggplot(data = dairy_queen, aes(sample = cal_fat)) + 
  geom_line(stat = "qq")

# line fit distribution
qqnorm(calories_Fat); qqline(calories_Fat)

# A useful way to address this question is to rephrase it as: what do probability plots look like for data that I know came from a normal distribution? We can answer this by simulating data from a normal distribution using rnorm.

# Seed the random number generator so the simulated sample, and therefore the
# plots that the written answers describe, are the same on every run.
set.seed(606)
# Simulate as many normal draws as there are Dairy Queen items, using the
# sample mean and standard deviation of their calories from fat.
sim_norm <- rnorm(n = nrow(dairy_queen), mean = dqmean, sd = dqsd)

Exercise 3

Make a normal probability plot of sim_norm. Do all of the points fall on the line? How does this plot compare to the probability plot for the real data? (Since sim_norm is not a dataframe, it can be put directly into the sample argument and the data argument can be dropped.) Answer: Based on the Q-Q plot, most of the points fall on the line. Compared to the probability plot for the real data, this plot shows slightly more points adhering to the line.

hist(sim_norm, probability = TRUE, ylim = c(0, 0.003), breaks = 25)
x <- 0:700
y <- dnorm(x = x, mean = dqmean, sd = dqsd)
lines(x = x, y = y, col = "red")

qqnorm(sim_norm); qqline(sim_norm)

# Even better than comparing the original plot to a single plot generated from a normal distribution is to compare it to many more plots using the following function. It shows the Q-Q plot corresponding to the original data in the top left corner, and the Q-Q plots of 8 different simulated normal data. It may be helpful to click the zoom button in the plot window.

qqnormsim(calories_Fat)

Exercise 4

Does the normal probability plot for the calories from fat look similar to the plots created for the simulated data? That is, do the plots provide evidence that the calories from fat are nearly normal? Answer: Yes, the normal probability plot for the calories from fat looks similar to the plots created for the simulated data.

Exercise 5

Using the same technique, determine whether or not the calories from McDonald’s menu appear to come from a normal distribution. Answer: McDonald’s data on calories from fat show a right-skewed, unimodal distribution with outliers. The Q-Q plot of the real data suggests a nearly normal distribution. The simulated data are even clearer, with more of the points falling on the line.
Somehow the simulated data show a normal, unimodal, symmetric distribution, which differs a little from the real data.

calories_Fat2 <- mcdonalds$cal_fat
summary(calories_Fat2)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    50.0   160.0   240.0   285.6   320.0  1270.0

calories_Fat2

##  [1]   60  410  600  280  410  250  100  210  190  400  170  300  180  300   70
## [16]   50  330  190  310  130  160  200  300  160  280  200  200  240  320  180
## [31]  300  340  200  320  190  250  390  630  790 1270  100  140  240  480  960
## [46]  240  360  600   70   80  250  110  120  250   90  100  230

view(calories_Fat2)
hist(calories_Fat2)

# line fit distribution
qqnorm(calories_Fat2); qqline(calories_Fat2)

# Summary statistics for McDonald's calories from fat: mean and standard deviation.
# (The "dq" prefix is carried over from the Dairy Queen section; these values are
# computed from calories_Fat2, i.e. mcdonalds$cal_fat.)
dqmean2 <- mean(calories_Fat2)
dqsd2   <- sd(calories_Fat2)
dqmean2

## [1] 285.614

dqsd2

## [1] 220.8993

# Density-scaled histogram of McDonald's calories from fat (freq = FALSE puts
# the y-axis on the density scale so a density curve can be drawn on top).
hist(
  calories_Fat2,
  freq = FALSE,
  col = "blue",
  xlim = c(50, 1270),
  xlab = "Fat calories",
  main = "Mcdonalds Distributions of the amount of calories from fat "
)
# Overlay the normal density that uses the McDonald's sample mean and standard
# deviation; lwd sets the line thickness. Compare the curve with the bars.
curve(dnorm(x, mean = dqmean2, sd = dqsd2), col = "red", lwd = 2, add = TRUE)

ggplot(data = mcdonalds, aes(x = cal_fat)) +
        geom_blank() +
        geom_histogram(aes(y = ..density..)) +
        stat_function(fun = dnorm, args = c(mean = dqmean2, sd = dqsd2), col = "tomato")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

hist(calories_Fat2, probability = TRUE, breaks = 10)
lines(density(calories_Fat2, adjust = 2, cut = 0), col = "Red", lwd = 2)

ggplot(data = mcdonalds, aes(sample = cal_fat)) + 
  geom_line(stat = "qq")

# line fit distribution
qqnorm(calories_Fat2); qqline(calories_Fat2)

# A useful way to address this question is to rephrase it as: what do probability plots look like for data that I know came from a normal distribution? We can answer this by simulating data from a normal distribution using rnorm.

sim_norm2 <- rnorm(n = nrow(mcdonalds), mean = dqmean2, sd = dqsd2)

hist(sim_norm2, probability = TRUE, ylim = c(0, 0.002), breaks = 25)
x <- 0:1270
y <- dnorm(x = x, mean = dqmean2, sd = dqsd2)
lines(x = x, y = y, col = "red")

qqnorm(sim_norm2);qqline(sim_norm2)

# Even better than comparing the original plot to a single plot generated from a normal distribution is to compare it to many more plots using the following function. It shows the Q-Q plot corresponding to the original data in the top left corner, and the Q-Q plots of 8 different simulated normal data. It may be helpful to click the zoom button in the plot window.

qqnormsim(calories_Fat2)

Exercise 6

Write out two probability questions that you would like to answer about any of the restaurants in this dataset. Calculate those probabilities using both the theoretical normal distribution as well as the empirical distribution (four probabilities in all). Which one had a closer agreement between the two methods?

What is the probability that a randomly chosen Dairy Queen product has less than 10 fat calories? Answer: under the theoretical normal distribution, the probability that a randomly chosen Dairy Queen product has less than 10 fat calories is p = 5.47%. The empirical data (the observed proportion, not a simulation) give about p = 2.38%, which is acceptably close.

What is the probability that a randomly chosen McDonald’s product has less than 10 fat calories? Answer: under the theoretical normal distribution, the probability that a randomly chosen McDonald’s product has less than 10 fat calories is p = 10.6%. The empirical data give p = 0%, which is about 10.6 percentage points away from the theoretical value. The two methods therefore agree more closely for Dairy Queen; I think the Dairy Queen data on fat calories are closer to normally distributed than the McDonald’s data.

# for example, the question of, “What is the probability that a randomly chosen Dairy Queen product has more than 600 calories from fat?”
#If we assume that the calories from fat from Dairy Queen’s menu are normally distributed (a very close approximation is also okay),
# we can find this probability by calculating a Z score and consulting a Z table (also called a normal probability table). 
# In R, this is done in one step with the function pnorm().
# 
pnorm(q = 600, mean = dqmean, sd = dqsd) # Answer: the probability that the fat calories from the Dairy Queen restaurant is p = 1.5 %

## [1] 0.9849848

# Upper-tail probability P(X > 600) under the fitted normal model, computed
# directly from the distribution instead of subtracting a hand-copied, rounded
# printout from 1 (so it stays correct if the data or parameters change).
p <- pnorm(q = 600, mean = dqmean, sd = dqsd, lower.tail = FALSE)
p

## [1] 0.0150152

#Assuming a normal distribution has allowed us to calculate a theoretical probability. 
#If we want to calculate the probability empirically, we simply need to determine how many observations fall above 600 then divide this number by the total sample size.

# dairy_queen %>% 
#   filter(cal_fat > 600) %>%
#   summarise(percent = n() / nrow(dairy_queen)) 
# Empirical probability: the proportion of observed Dairy Queen items with more
# than 600 calories from fat (the mean of a logical vector is the share of TRUEs).
# Result: 4.76 %, somewhat higher than the 1.5 % given by the normal model.
mean(dairy_queen$cal_fat > 600)

## [1] 0.04761905

# Although the probabilities are not exactly the same, they are reasonably close. The closer that your distribution is to being normal, the more accurate the theoretical probabilities will be.


# What is the probability that a randomly chosen Dairy Queen product has less than 10 fat calories?
pnorm(q = 10, mean = dqmean, sd = dqsd)  # Theoretical answer (normal model): P(cal_fat < 10) is about p = 5.47 %

## [1] 0.05472837

normalPlot(mean = dqmean, sd = dqsd, bounds = c(-700, 10), tails = FALSE)

sum(dairy_queen$cal_fat < 10) / length((dairy_queen$cal_fat)) # Empirical answer: proportion of observed (not simulated) items below 10, about p = 2.38 %

## [1] 0.02380952

#table(dairy_queen$cal_fat)
# What is the probability that a randomly chosen McDonald's product has less than 10 fat calories?

pnorm(q = 10, mean = dqmean2, sd = dqsd2) # Theoretical answer (normal model): P(cal_fat < 10) is about p = 10.6 %

## [1] 0.1060721

normalPlot(mean = dqmean2, sd = dqsd2, bounds = c(-1300, 10), tails = FALSE)

# Empirical answer: proportion of observed McDonald's items with fewer than 10
# calories from fat. The smallest observed value is 50, so this proportion is 0.
sum(mcdonalds$cal_fat < 10) / length((mcdonalds$cal_fat))

## [1] 0

Exercise 7

Now let’s consider some of the other variables in the dataset. Out of all the different restaurants, which ones’ distribution is the closest to normal for sodium? Answer: There are 8 restaurants in this fastfood dataset; I think Taco Bell’s sodium data are the closest to normally distributed.

# Look at the frequency distribution of menu items across restaurants
table(fastfood$restaurant)

## 
##       Arbys Burger King Chick Fil-A Dairy Queen   Mcdonalds       Sonic 
##          55          70          27          42          57          53 
##      Subway   Taco Bell 
##          96         115

Arbys <- fastfood %>%
  filter(restaurant == "Arbys")

 BurgerKing <- fastfood %>%
  filter(restaurant == "Burger King")
 view(BurgerKing)
 is.na(BurgerKing$sodium) # checking if there is a missing data in the dataset, return false = no 'NA'

##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

sum(is.na(BurgerKing$sodium))

## [1] 0

view(BurgerKing$sodium)
 ChickFil_A <- fastfood %>%
  filter(restaurant == "Chick Fil-A")
 
# Per-restaurant subsets for Sonic, Subway and Taco Bell
Sonic <- filter(fastfood, restaurant == "Sonic")

Subway <- filter(fastfood, restaurant == "Subway")

TacoBell <- filter(fastfood, restaurant == "Taco Bell")

qqnorm(dairy_queen$sodium); qqline(dairy_queen$sodium)

qqnorm(mcdonalds$sodium); qqline(mcdonalds$sodium)

qqnorm(Arbys$sodium); qqline(Arbys$sodium)

# Normal Q-Q plot of Arby's sodium drawn with ggplot2. stat_qq() is used in
# place of qplot(..., stat = "qq"), whose `stat` argument is deprecated and
# only produced a warning.
ggplot(data = Arbys, aes(sample = sodium)) +
  stat_qq()

qqnorm(BurgerKing$sodium); qqline(BurgerKing$sodium)

#qplot(sample = sodium, data = BurgerKing, stat = "qq")

qqnorm(ChickFil_A$sodium); qqline(ChickFil_A$sodium)

#hist(ChickFil_A$sodium, probability = TRUE, breaks = 25)
#lines(density(ChickFil_A$sodium, adjust = 2, cut = 0), col = "Red", lwd = 2)

qqnorm(Sonic$sodium); qqline(Sonic$sodium)

qqnorm(Subway$sodium); qqline(Subway$sodium)

#hist(total_Carbs, probability = TRUE, breaks = 10)
#lines(density(total_Carbs, adjust = 2, cut = 0), col = "Red", lwd = 2)

qqnorm(TacoBell$sodium); qqline(TacoBell$sodium)

#hist(total_Carbs, probability = TRUE, breaks = 10)
#lines(density(total_Carbs, adjust = 2, cut = 0), col = "Red", lwd = 2)

Exercise 8

Note that some of the normal probability plots for sodium distributions seem to have a stepwise pattern. why do you think this might be the case?

Answer: I don’t see any stepwise pattern in the normal probability plots of the sodium distributions for any restaurant. If there were one, it would be due to discrepancies or repeated values in the data.

Exercise 9

As you can see, normal probability plots can be used both to assess normality and visualize skewness. Make a normal probability plot for the total carbohydrates from a restaurant of your choice. Based on this normal probability plot, is this variable left skewed, symmetric, or right skewed? Use a histogram to confirm your findings. Answer: For Taco Bell, total carbohydrates follow a right-skewed, unimodal, approximately normal distribution. The normal probability plot and histogram show a short left tail, one mode, and a center near the mean, but the distribution is not even on both sides, so we cannot conclude that it is symmetric.

# Filter the restaurant column down to Taco Bell only, then print the subset
TacoBell <- fastfood %>%
  filter(restaurant == "Taco Bell")
TacoBell

## # A tibble: 115 x 17
##    restaurant item  calories cal_fat total_fat sat_fat trans_fat cholesterol
##    <chr>      <chr>    <dbl>   <dbl>     <dbl>   <dbl>     <dbl>       <dbl>
##  1 Taco Bell  1/2 ~      540     230        26       7         1          45
##  2 Taco Bell  1/2 ~      460     170        18       7         1          45
##  3 Taco Bell  7-La~      510     170        19       7         0          20
##  4 Taco Bell  Bean~      370     100        11       4         0           5
##  5 Taco Bell  Beef~      550     200        22       8         0          35
##  6 Taco Bell  Beef~      440     160        18       5         0          20
##  7 Taco Bell  Blac~      410     110        12       4         0          10
##  8 Taco Bell  Burr~      420     140        16       7         0          35
##  9 Taco Bell  Burr~      390     110        12       5         0          40
## 10 Taco Bell  Burr~      390     120        13       5         0          30
## # ... with 105 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## #   fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## #   calcium <dbl>, salad <chr>

view(TacoBell$total_carb)


# making plot : Freq histogram

total_Carbs <- TacoBell$total_carb
summary(total_Carbs)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   12.00   29.00   44.00   46.63   64.00  107.00

hist(total_Carbs)

# line fit distribution
qqnorm(total_Carbs); qqline(total_Carbs)

# Summary statistics for Taco Bell total carbohydrates: mean and standard deviation.
# (The "dq" prefix is carried over from the Dairy Queen section; these values are
# computed from total_Carbs, i.e. TacoBell$total_carb.)
dqmean9 <- mean(total_Carbs)
dqsd9   <- sd(total_Carbs)
dqmean9

## [1] 46.63478

dqsd9

## [1] 22.51835

# Density-scaled histogram of Taco Bell total carbohydrates (freq = FALSE puts
# the y-axis on the density scale so a density curve can be drawn on top).
hist(
  total_Carbs,
  freq = FALSE,
  col = "blue",
  xlim = c(12, 107),
  xlab = "Total Carbohydrates",
  main = "Distributions of the amount of Total carbohydrates from Taco Bell "
)
# Overlay the normal density that uses the sample mean and standard deviation of
# total_Carbs; lwd sets the line thickness. Compare the curve with the bars.
curve(dnorm(x, mean = dqmean9, sd = dqsd9), col = "red", lwd = 2, add = TRUE)

# better fit curves
# hist(total_Carbs, probability = TRUE)
# lines(density(total_Carbs, cut = 0), col = "Red", lwd = 2)
hist(total_Carbs, probability = TRUE, breaks = 10)
lines(density(total_Carbs, adjust = 2, cut = 0), col = "Red", lwd = 2)