library(tidyverse) #loading all library needed for this assignment
library(openintro)
head(fastfood)
## # A tibble: 6 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Arti~ 380 60 7 2 0 95
## 2 Mcdonalds Sing~ 840 410 45 17 1.5 130
## 3 Mcdonalds Doub~ 1130 600 67 27 3 220
## 4 Mcdonalds Gril~ 750 280 31 10 0.5 155
## 5 Mcdonalds Cris~ 920 410 45 12 0.5 120
## 6 Mcdonalds Big ~ 540 250 28 10 1 80
## # ... with 9 more variables: sodium <dbl>, total_carb <dbl>, fiber <dbl>,
## # sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>, calcium <dbl>,
## # salad <chr>
library(readxl)
library(readr)
library(plyr)
library(dplyr)
library(dice)
# #library(VennDiagram)
# #library(help = "dice")
# library(DBI)
# library(dbplyr)
# library(data.table)
# library(rstudioapi)
# library(RJDBC)
# library(odbc)
# library(RSQLite)
# library(rvest)
# library(stringr)
# library(readtext)
# library(ggpubr)
library(fitdistrplus)
library(ggplot2)
library(moments)
library(qualityTools)
library(normalp)
library(utils)
library(MASS)
library(qqplotr)
library(DATA606)
##
## Welcome to CUNY DATA606 Statistics and Probability for Data Analytics
## This package is designed to support this course. The text book used
## is OpenIntro Statistics, 3rd Edition. You can read this by typing
## vignette('os3') or visit www.OpenIntro.org.
##
## The getLabs() function will return a list of the labs available.
##
## The demo(package='DATA606') will list the demos that are available.
Make a plot (or plots) to visualize the distributions of the amount of calories from fat of the options from these two restaurants. How do their centers, shapes, and spreads compare? Answer: For Dairy Queen, the center is close to the mean and the shorter tail is on the left. There is one mode around [100, 200], which lies to the left of the mean (about 260). Thus, the distribution is unimodal (although it looks somewhat multimodal) and right skewed; a smooth curve drawn over the histogram would also look right skewed. The spread runs from 0 (minimum fat calories) to 670 (maximum fat calories), so the range is 670 - 0 = 670.
fastfood
## # A tibble: 515 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Arti~ 380 60 7 2 0 95
## 2 Mcdonalds Sing~ 840 410 45 17 1.5 130
## 3 Mcdonalds Doub~ 1130 600 67 27 3 220
## 4 Mcdonalds Gril~ 750 280 31 10 0.5 155
## 5 Mcdonalds Cris~ 920 410 45 12 0.5 120
## 6 Mcdonalds Big ~ 540 250 28 10 1 80
## 7 Mcdonalds Chee~ 300 100 12 5 0.5 40
## 8 Mcdonalds Clas~ 510 210 24 4 0 65
## 9 Mcdonalds Doub~ 430 190 21 11 1 85
## 10 Mcdonalds Doub~ 770 400 45 21 2.5 175
## # ... with 505 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## # fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## # calcium <dbl>, salad <chr>
view(fastfood)
summary(fastfood)
## restaurant item calories cal_fat
## Length:515 Length:515 Min. : 20.0 Min. : 0.0
## Class :character Class :character 1st Qu.: 330.0 1st Qu.: 120.0
## Mode :character Mode :character Median : 490.0 Median : 210.0
## Mean : 530.9 Mean : 238.8
## 3rd Qu.: 690.0 3rd Qu.: 310.0
## Max. :2430.0 Max. :1270.0
##
## total_fat sat_fat trans_fat cholesterol
## Min. : 0.00 Min. : 0.000 Min. :0.000 Min. : 0.00
## 1st Qu.: 14.00 1st Qu.: 4.000 1st Qu.:0.000 1st Qu.: 35.00
## Median : 23.00 Median : 7.000 Median :0.000 Median : 60.00
## Mean : 26.59 Mean : 8.153 Mean :0.465 Mean : 72.46
## 3rd Qu.: 35.00 3rd Qu.:11.000 3rd Qu.:1.000 3rd Qu.: 95.00
## Max. :141.00 Max. :47.000 Max. :8.000 Max. :805.00
##
## sodium total_carb fiber sugar
## Min. : 15 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 800 1st Qu.: 28.50 1st Qu.: 2.000 1st Qu.: 3.000
## Median :1110 Median : 44.00 Median : 3.000 Median : 6.000
## Mean :1247 Mean : 45.66 Mean : 4.137 Mean : 7.262
## 3rd Qu.:1550 3rd Qu.: 57.00 3rd Qu.: 5.000 3rd Qu.: 9.000
## Max. :6080 Max. :156.00 Max. :17.000 Max. :87.000
## NA's :12
## protein vit_a vit_c calcium
## Min. : 1.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 16.00 1st Qu.: 4.00 1st Qu.: 4.00 1st Qu.: 8.00
## Median : 24.50 Median : 10.00 Median : 10.00 Median : 20.00
## Mean : 27.89 Mean : 18.86 Mean : 20.17 Mean : 24.85
## 3rd Qu.: 36.00 3rd Qu.: 20.00 3rd Qu.: 30.00 3rd Qu.: 30.00
## Max. :186.00 Max. :180.00 Max. :400.00 Max. :290.00
## NA's :1 NA's :214 NA's :210 NA's :210
## salad
## Length:515
## Class :character
## Mode :character
##
##
##
##
# Keep only the McDonald's rows of the fastfood data, then print the subset
mcdonalds <- filter(fastfood, restaurant == "Mcdonalds")
mcdonalds
## # A tibble: 57 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Arti~ 380 60 7 2 0 95
## 2 Mcdonalds Sing~ 840 410 45 17 1.5 130
## 3 Mcdonalds Doub~ 1130 600 67 27 3 220
## 4 Mcdonalds Gril~ 750 280 31 10 0.5 155
## 5 Mcdonalds Cris~ 920 410 45 12 0.5 120
## 6 Mcdonalds Big ~ 540 250 28 10 1 80
## 7 Mcdonalds Chee~ 300 100 12 5 0.5 40
## 8 Mcdonalds Clas~ 510 210 24 4 0 65
## 9 Mcdonalds Doub~ 430 190 21 11 1 85
## 10 Mcdonalds Doub~ 770 400 45 21 2.5 175
## # ... with 47 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## # fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## # calcium <dbl>, salad <chr>
# Inspect the McDonald's subset in the data viewer
view(mcdonalds)
# Keep only the Dairy Queen rows of the fastfood data, then print the subset
dairy_queen <- filter(fastfood, restaurant == "Dairy Queen")
dairy_queen
## # A tibble: 42 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Dairy Que~ 1/2 ~ 1000 660 74 26 2 170
## 2 Dairy Que~ 1/2 ~ 800 460 51 20 2 135
## 3 Dairy Que~ 1/4 ~ 630 330 37 13 1 95
## 4 Dairy Que~ 1/4 ~ 540 270 30 11 1 70
## 5 Dairy Que~ 1/4 ~ 570 310 35 11 1 75
## 6 Dairy Que~ Orig~ 400 160 18 9 1 65
## 7 Dairy Que~ Orig~ 630 310 34 18 2 125
## 8 Dairy Que~ 4 Pi~ 1030 480 53 9 1 80
## 9 Dairy Que~ 6 Pi~ 1260 590 66 11 1 120
## 10 Dairy Que~ Baco~ 420 240 26 11 1 60
## # ... with 32 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## # fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## # calcium <dbl>, salad <chr>
view(dairy_queen)
summary(dairy_queen)
## restaurant item calories cal_fat
## Length:42 Length:42 Min. : 20.0 Min. : 0.0
## Class :character Class :character 1st Qu.: 350.0 1st Qu.:160.0
## Mode :character Mode :character Median : 485.0 Median :220.0
## Mean : 520.2 Mean :260.5
## 3rd Qu.: 630.0 3rd Qu.:310.0
## Max. :1260.0 Max. :670.0
##
## total_fat sat_fat trans_fat cholesterol
## Min. : 0.00 Min. : 0.00 Min. :0.0000 Min. : 0.00
## 1st Qu.:18.00 1st Qu.: 5.00 1st Qu.:0.0000 1st Qu.: 41.25
## Median :24.50 Median : 9.00 Median :1.0000 Median : 60.00
## Mean :28.86 Mean :10.44 Mean :0.6786 Mean : 71.55
## 3rd Qu.:34.75 3rd Qu.:12.50 3rd Qu.:1.0000 3rd Qu.:100.00
## Max. :75.00 Max. :43.00 Max. :2.0000 Max. :180.00
##
## sodium total_carb fiber sugar
## Min. : 15.0 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 847.5 1st Qu.: 25.25 1st Qu.: 1.000 1st Qu.: 3.000
## Median :1030.0 Median : 34.00 Median : 2.000 Median : 6.000
## Mean :1181.8 Mean : 38.69 Mean : 2.833 Mean : 6.357
## 3rd Qu.:1362.5 3rd Qu.: 44.75 3rd Qu.: 3.000 3rd Qu.: 8.750
## Max. :3500.0 Max. :121.00 Max. :12.000 Max. :30.000
##
## protein vit_a vit_c calcium
## Min. : 1.00 Min. : 0 Min. : 0.00 Min. : 0.00
## 1st Qu.:17.00 1st Qu.: 9 1st Qu.: 0.00 1st Qu.: 6.00
## Median :23.00 Median :10 Median : 4.00 Median : 10.00
## Mean :24.83 Mean :14 Mean : 4.37 Mean : 16.41
## 3rd Qu.:34.00 3rd Qu.:20 3rd Qu.: 6.00 3rd Qu.: 20.00
## Max. :49.00 Max. :50 Max. :30.00 Max. :100.00
## NA's :15 NA's :15 NA's :15
## salad
## Length:42
## Class :character
## Mode :character
##
##
##
##
# Pull Dairy Queen's calories-from-fat column into its own vector for the
# base-graphics plots, and print its five-number summary plus mean
calories_Fat <- dairy_queen[["cal_fat"]]
summary(calories_Fat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 160.0 220.0 260.5 310.0 670.0
calories_Fat
## [1] 660 460 330 270 310 160 310 480 590 240 220 220 180 160 180 80 80 410 670
## [20] 440 140 200 160 310 240 130 430 310 430 180 180 220 280 120 270 190 170 20
## [39] 140 130 0 240
# Frequency histogram of Dairy Queen calories from fat
hist(calories_Fat)
# Normal Q-Q plot of the same values, with its reference line
qqnorm(calories_Fat)
qqline(calories_Fat)
# Sample mean and standard deviation of Dairy Queen calories from fat;
# these parameterise the normal curves overlaid on the histograms
dqmean <- mean(dairy_queen[["cal_fat"]])
dqsd <- sd(dairy_queen[["cal_fat"]])
dqmean
## [1] 260.4762
dqsd
## [1] 156.4851
# Density-scaled histogram of Dairy Queen calories from fat, with explicit
# title, axis label, x-range and fill colour
hist(
  calories_Fat,
  freq = FALSE,
  col = "blue",
  xlim = c(0, 700),
  xlab = "Fat calories",
  main = "Distributions of the amount of calories from fat "
)
# Overlay the normal density with the sample mean and sd to judge the fit
# by eye (lwd sets the line thickness)
curve(dnorm(x, mean = dqmean, sd = dqsd), col = "red", lwd = 2, add = TRUE)
# Check the Dairy Queen calories-from-fat vector for missing values, then
# find its maximum and minimum (used to choose the histogram x-range)
is.na(calories_Fat) # element-wise missing-value flags; FALSE means the value is present
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE
sum(is.na(calories_Fat)) # total count of missing values, easier to read than the full vector; 0 means no NA
## [1] 0
max(calories_Fat) # also reported by summary(calories_Fat)
## [1] 670
min(calories_Fat)
## [1] 0
# Using a density histogram allows us to properly overlay a normal distribution curve over the histogram since the curve is a normal probability density function that also has area under the curve of 1. Frequency and density histograms both display the same exact shape; they only differ in their y-axis. You can verify this by comparing the frequency histogram you constructed earlier and the density histogram created by the commands below.
# Density-scaled ggplot histogram of Dairy Queen calories from fat with the
# normal density (mean = dqmean, sd = dqsd) drawn on top.
# - after_stat(density) replaces the deprecated ..density.. notation.
# - bins = 30 is the value stat_bin() was already falling back to; stating it
#   keeps the same plot and silences the "Pick better value" message.
# - args is given as a list, the form stat_function() documents.
ggplot(data = dairy_queen, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density)), bins = 30) +
  stat_function(fun = dnorm, args = list(mean = dqmean, sd = dqsd), col = "tomato")
Based on this plot, does it appear that the data follow a nearly normal distribution? Answer: The points on the normal Q-Q plot adhere closely to the line, so the distribution is approximately normal.
# Density-scaled histogram with a kernel density estimate overlaid; the
# estimate follows the data rather than assuming a normal shape
hist(calories_Fat, freq = FALSE)
lines(density(calories_Fat, cut = 0), lwd = 2, col = "Red")
# Eyeballing the shape of the histogram is one way to determine if the data appear to be nearly normally distributed, but it can be frustrating to decide just how close the histogram is to the curve. An alternative approach involves constructing a normal probability plot, also called a normal Q-Q plot for “quantile-quantile”.
# Normal Q-Q plot of Dairy Queen calories from fat drawn with ggplot2
ggplot(dairy_queen, aes(sample = cal_fat)) +
  geom_line(stat = "qq")
# The same Q-Q plot in base graphics, with its reference line
qqnorm(calories_Fat)
qqline(calories_Fat)
# A useful way to address this question is to rephrase it as: what do probability plots look like for data that I know came from a normal distribution? We can answer this by simulating data from a normal distribution using rnorm.
# Simulate a normal sample with the same size, mean and sd as the Dairy Queen
# data. The seed is fixed so the simulated plots (and the written answers
# that interpret them) can be reproduced on every run.
set.seed(606)
sim_norm <- rnorm(n = nrow(dairy_queen), mean = dqmean, sd = dqsd)
Make a normal probability plot of sim_norm. Do all of the points fall on the line? How does this plot compare to the probability plot for the real data? (Since sim_norm is not a dataframe, it can be put directly into the sample argument and the data argument can be dropped.) Answer: Based on the Q-Q plot, most of the points fall on the line. Compared to the probability plot for the real data, this plot shows slightly more points adhering to the line.
# Density-scaled histogram of the simulated normal sample
hist(sim_norm, freq = FALSE, breaks = 25, ylim = c(0, 0.003))
# Normal density (Dairy Queen mean and sd) evaluated on 0..700 and drawn on
# top of the histogram
x <- seq.int(0L, 700L)
y <- dnorm(x, mean = dqmean, sd = dqsd)
lines(x, y, col = "red")
# Q-Q plot of the simulated sample with its reference line
qqnorm(sim_norm)
qqline(sim_norm)
# Rather than comparing against a single simulated sample, compare against
# several: qqnormsim() shows the Q-Q plot of the original data in the top
# left corner next to Q-Q plots of simulated normal data. Zooming the plot
# window may help.
qqnormsim(calories_Fat)
Does the normal probability plot for the calories from fat look similar to the plots created for the simulated data? That is, do the plots provide evidence that the calories from fat are nearly normal? Answer: Yes, the normal probability plot for the calories from fat looks similar to the plots created for the simulated data.
Using the same technique, determine whether or not the calories from McDonald’s menu appear to come from a normal distribution. Answer: McDonald’s data on fat calories show a right-skewed, unimodal distribution with outliers. The Q-Q plot of the real data follows the line over most of its range, suggesting an approximately normal distribution apart from the outliers. The simulated data are even clearer, with more points falling on the line.
Somehow the simulated data show a normal, unimodal, symmetric distribution, which differs a little from the real data.
# Pull McDonald's calories-from-fat column into its own vector and print its
# five-number summary plus mean
calories_Fat2 <- mcdonalds[["cal_fat"]]
summary(calories_Fat2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.0 160.0 240.0 285.6 320.0 1270.0
calories_Fat2
## [1] 60 410 600 280 410 250 100 210 190 400 170 300 180 300 70
## [16] 50 330 190 310 130 160 200 300 160 280 200 200 240 320 180
## [31] 300 340 200 320 190 250 390 630 790 1270 100 140 240 480 960
## [46] 240 360 600 70 80 250 110 120 250 90 100 230
# Inspect the McDonald's calories-from-fat values in the data viewer
view(calories_Fat2)
# Frequency histogram of McDonald's calories from fat
hist(calories_Fat2)
# Normal Q-Q plot of the same values, with its reference line
qqnorm(calories_Fat2)
qqline(calories_Fat2)
# Sample mean and standard deviation of McDonald's calories from fat
# (the "dq" prefix is only a naming carry-over; these are McDonald's values)
dqmean2 <- mean(calories_Fat2)
dqsd2 <- sd(calories_Fat2)
dqmean2
## [1] 285.614
dqsd2
## [1] 220.8993
# Density-scaled histogram of McDonald's calories from fat, with explicit
# title, axis label, x-range and fill colour
hist(
  calories_Fat2,
  freq = FALSE,
  col = "blue",
  xlim = c(50, 1270),
  xlab = "Fat calories",
  main = "Mcdonalds Distributions of the amount of calories from fat "
)
# Overlay the normal density with the McDonald's sample mean and sd to judge
# the fit by eye (lwd sets the line thickness)
curve(dnorm(x, mean = dqmean2, sd = dqsd2), col = "red", lwd = 2, add = TRUE)
# Density-scaled ggplot histogram of McDonald's calories from fat with the
# normal density (mean = dqmean2, sd = dqsd2) drawn on top.
# - after_stat(density) replaces the deprecated ..density.. notation.
# - bins = 30 is the value stat_bin() was already falling back to; stating it
#   keeps the same plot and silences the "Pick better value" message.
# - args is given as a list, the form stat_function() documents.
ggplot(data = mcdonalds, aes(x = cal_fat)) +
  geom_blank() +
  geom_histogram(aes(y = after_stat(density)), bins = 30) +
  stat_function(fun = dnorm, args = list(mean = dqmean2, sd = dqsd2), col = "tomato")
# Density-scaled histogram of McDonald's calories from fat with a smoothed
# (adjust = 2) kernel density estimate overlaid
hist(calories_Fat2, freq = FALSE, breaks = 10)
lines(density(calories_Fat2, adjust = 2, cut = 0), lwd = 2, col = "Red")
# Normal Q-Q plot drawn with ggplot2
ggplot(mcdonalds, aes(sample = cal_fat)) +
  geom_line(stat = "qq")
# The same Q-Q plot in base graphics, with its reference line
qqnorm(calories_Fat2)
qqline(calories_Fat2)
# A useful way to address this question is to rephrase it as: what do probability plots look like for data that I know came from a normal distribution? We can answer this by simulating data from a normal distribution using rnorm.
# Simulate a normal sample with the same size, mean and sd as the McDonald's
# data. The seed is fixed so the simulated plots (and the written answers
# that interpret them) can be reproduced on every run.
set.seed(606)
sim_norm2 <- rnorm(n = nrow(mcdonalds), mean = dqmean2, sd = dqsd2)
# Density-scaled histogram of the simulated normal sample
hist(sim_norm2, freq = FALSE, breaks = 25, ylim = c(0, 0.002))
# Normal density (McDonald's mean and sd) evaluated on 0..1270 and drawn on
# top of the histogram
x <- seq.int(0L, 1270L)
y <- dnorm(x, mean = dqmean2, sd = dqsd2)
lines(x, y, col = "red")
# Q-Q plot of the simulated sample with its reference line
qqnorm(sim_norm2)
qqline(sim_norm2)
# Rather than comparing against a single simulated sample, compare against
# several: qqnormsim() shows the Q-Q plot of the original data in the top
# left corner next to Q-Q plots of simulated normal data. Zooming the plot
# window may help.
qqnormsim(calories_Fat2)
Write out two probability questions that you would like to answer about any of the restaurants in this dataset. Calculate those probabilities using both the theoretical normal distribution as well as the empirical distribution (four probabilities in all). Which one had a closer agreement between the two methods?
What is the probability that a randomly chosen Dairy Queen product has less than 10 fat calories? Answer: under the theoretical normal distribution, the probability that a randomly chosen Dairy Queen product has less than 10 fat calories is p = 5.47%. Answer: the empirical data give about p = 2.38%, which is acceptably close.
What is the probability that a randomly chosen Mcdonalds product has less than 10 fat calories? Answer: under the theoretical normal distribution, the probability that a randomly chosen Mcdonalds product has less than 10 fat calories is p = 10.6%. Answer: the empirical data give p = 0%, which is about 10.6 percentage points away from the theoretical value … I think the Dairy Queen fat-calorie data agree more closely with the normal model than the Mcdonalds data do.
# Example question: “What is the probability that a randomly chosen Dairy Queen
# product has more than 600 calories from fat?”
# If we assume that the calories from fat from Dairy Queen’s menu are normally
# distributed (a very close approximation is also okay), we can find this
# probability by calculating a Z score and consulting a Z table (also called a
# normal probability table). In R, this is done in one step with pnorm().
#
# pnorm() returns the lower tail by default, i.e. P(cal_fat <= 600) under the
# normal model with the Dairy Queen mean and sd.
pnorm(q = 600, mean = dqmean, sd = dqsd)
## [1] 0.9849848
# The question asks for the upper tail, P(cal_fat > 600), about 1.5%.
# Request it from pnorm() directly instead of retyping the rounded value
# printed above, so the result always follows dqmean and dqsd.
p <- pnorm(q = 600, mean = dqmean, sd = dqsd, lower.tail = FALSE)
p
## [1] 0.0150152
# Assuming a normal distribution gave a theoretical probability. The empirical
# probability is the number of observations above 600 divided by the total
# sample size. Equivalent dplyr version:
# dairy_queen %>%
# filter(cal_fat > 600) %>%
# summarise(percent = n() / nrow(dairy_queen))
# Result: 4.76%, higher than the 1.5% given by the normal model
with(dairy_queen, sum(cal_fat > 600) / length(cal_fat))
## [1] 0.04761905
# Although the probabilities are not exactly the same, they are reasonably close. The closer that your distribution is to being normal, the more accurate the theoretical probabilities will be.
# Question: what is the probability that a randomly chosen Dairy Queen product
# has less than 10 calories from fat?
# Theoretical answer from the normal model: P(cal_fat < 10) = 5.47%
pnorm(10, mean = dqmean, sd = dqsd)
## [1] 0.05472837
# Plot the same normal model with the bounds -700 to 10 marked
normalPlot(mean = dqmean, sd = dqsd, bounds = c(-700, 10), tails = FALSE)
# Empirical answer: share of Dairy Queen items below 10, about 2.38%
with(dairy_queen, sum(cal_fat < 10) / length(cal_fat))
## [1] 0.02380952
#table(dairy_queen$cal_fat)
# Question: what is the probability that a randomly chosen McDonald's product
# has less than 10 calories from fat?
# Theoretical answer from the normal model: P(cal_fat < 10) = 10.6%
pnorm(10, mean = dqmean2, sd = dqsd2)
## [1] 0.1060721
# Plot the same normal model with the bounds -1300 to 10 marked
normalPlot(mean = dqmean2, sd = dqsd2, bounds = c(-1300, 10), tails = FALSE)
# Empirical answer: share of McDonald's items below 10
with(mcdonalds, sum(cal_fat < 10) / length(cal_fat))
## [1] 0
Now let’s consider some of the other variables in the dataset. Out of all the different restaurants, which ones’ distribution is the closest to normal for sodium? Answer: There are 8 restaurants in this fastfood dataset; I think Taco Bell’s sodium distribution is the closest to normal.
# Frequency table: number of menu items recorded for each restaurant
table(fastfood$restaurant)
##
## Arbys Burger King Chick Fil-A Dairy Queen Mcdonalds Sonic
## 55 70 27 42 57 53
## Subway Taco Bell
## 96 115
# One subset per restaurant for the sodium comparison: Arby's and Burger King
Arbys <- filter(fastfood, restaurant == "Arbys")
BurgerKing <- filter(fastfood, restaurant == "Burger King")
# Inspect the Burger King subset in the data viewer
view(BurgerKing)
is.na(BurgerKing$sodium) # element-wise missing-value flags for Burger King sodium; FALSE means the value is present
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Total count of missing sodium values; 0 means no NA
sum(is.na(BurgerKing$sodium))
## [1] 0
# Inspect the Burger King sodium values in the data viewer
view(BurgerKing$sodium)
# Remaining per-restaurant subsets for the sodium comparison
ChickFil_A <- filter(fastfood, restaurant == "Chick Fil-A")
Sonic <- filter(fastfood, restaurant == "Sonic")
Subway <- filter(fastfood, restaurant == "Subway")
TacoBell <- filter(fastfood, restaurant == "Taco Bell")
# Normal Q-Q plot of sodium, with reference line, for Dairy Queen
qqnorm(dairy_queen$sodium)
qqline(dairy_queen$sodium)
# ... for McDonald's
qqnorm(mcdonalds$sodium)
qqline(mcdonalds$sodium)
# ... for Arby's
qqnorm(Arbys$sodium)
qqline(Arbys$sodium)
# Normal Q-Q plot of Arby's sodium drawn with ggplot2. qplot()'s `stat`
# argument is deprecated (it produced a "`stat` is deprecated" warning), so
# the plot is built explicitly with geom_qq() instead.
ggplot(data = Arbys, aes(sample = sodium)) +
  geom_qq()
# Normal Q-Q plot of sodium, with reference line, for each remaining
# restaurant in turn: Burger King, Chick Fil-A, Sonic, Subway, Taco Bell
invisible(lapply(
  list(
    BurgerKing$sodium,
    ChickFil_A$sodium,
    Sonic$sodium,
    Subway$sodium,
    TacoBell$sodium
  ),
  function(sodium_values) {
    qqnorm(sodium_values)
    qqline(sodium_values)
  }
))
Note that some of the normal probability plots for sodium distributions seem to have a stepwise pattern. Why do you think this might be the case?
Answer: I don’t see a stepwise pattern in the normal probability plots of the sodium distributions for any restaurant. If there were one, it would be due to discrete or repeated values in the data.
As you can see, normal probability plots can be used both to assess normality and visualize skewness. Make a normal probability plot for the total carbohydrates from a restaurant of your choice. Based on this normal probability plot, is this variable left skewed, symmetric, or right skewed? Use a histogram to confirm your findings. Answer: Taco Bell’s total carbohydrates have a unimodal, right-skewed distribution that is only roughly normal. The normal probability plot and histogram show a shorter left tail and one mode, with the center near the mean, but the two sides of the distribution are not even, so we cannot conclude that the distribution is symmetric.
# Keep only the Taco Bell rows of the fastfood data, then print the subset
TacoBell <- filter(fastfood, restaurant == "Taco Bell")
TacoBell
## # A tibble: 115 x 17
## restaurant item calories cal_fat total_fat sat_fat trans_fat cholesterol
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Taco Bell 1/2 ~ 540 230 26 7 1 45
## 2 Taco Bell 1/2 ~ 460 170 18 7 1 45
## 3 Taco Bell 7-La~ 510 170 19 7 0 20
## 4 Taco Bell Bean~ 370 100 11 4 0 5
## 5 Taco Bell Beef~ 550 200 22 8 0 35
## 6 Taco Bell Beef~ 440 160 18 5 0 20
## 7 Taco Bell Blac~ 410 110 12 4 0 10
## 8 Taco Bell Burr~ 420 140 16 7 0 35
## 9 Taco Bell Burr~ 390 110 12 5 0 40
## 10 Taco Bell Burr~ 390 120 13 5 0 30
## # ... with 105 more rows, and 9 more variables: sodium <dbl>, total_carb <dbl>,
## # fiber <dbl>, sugar <dbl>, protein <dbl>, vit_a <dbl>, vit_c <dbl>,
## # calcium <dbl>, salad <chr>
# Inspect Taco Bell's total-carbohydrate values in the data viewer
view(TacoBell$total_carb)
# Pull the column into its own vector for the base-graphics plots and print
# its five-number summary plus mean
total_Carbs <- TacoBell[["total_carb"]]
summary(total_Carbs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.00 29.00 44.00 46.63 64.00 107.00
# Frequency histogram of Taco Bell total carbohydrates
hist(total_Carbs)
# Normal Q-Q plot of the same values, with its reference line
qqnorm(total_Carbs)
qqline(total_Carbs)
# Sample mean and standard deviation of Taco Bell total carbohydrates
# (the "dq" prefix is only a naming carry-over; these are Taco Bell values)
dqmean9 <- mean(total_Carbs)
dqsd9 <- sd(total_Carbs)
dqmean9
## [1] 46.63478
dqsd9
## [1] 22.51835
# Density-scaled histogram of Taco Bell total carbohydrates, with explicit
# title, axis label, x-range and fill colour
hist(
  total_Carbs,
  freq = FALSE,
  col = "blue",
  xlim = c(12, 107),
  xlab = "Total Carbohydrates",
  main = "Distributions of the amount of Total carbohydrates from Taco Bell "
)
# Overlay the normal density with the Taco Bell sample mean and sd to judge
# the fit by eye (lwd sets the line thickness)
curve(dnorm(x, mean = dqmean9, sd = dqsd9), col = "red", lwd = 2, add = TRUE)
# Earlier variant with default breaks and unsmoothed density, kept for reference:
# hist(total_Carbs, probability = TRUE)
# lines(density(total_Carbs, cut = 0), col = "Red", lwd = 2)
# Density-scaled histogram with a smoothed (adjust = 2) kernel density
# estimate overlaid
hist(total_Carbs, freq = FALSE, breaks = 10)
lines(density(total_Carbs, adjust = 2, cut = 0), lwd = 2, col = "Red")