library(tidyverse)
library(openintro)
library(ggplot2)
head(fastfood)## # A tibble: 6 × 17
## restaur…¹ item calor…² cal_fat total…³ sat_fat trans…⁴ chole…⁵ sodium total…⁶
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mcdonalds Arti… 380 60 7 2 0 95 1110 44
## 2 Mcdonalds Sing… 840 410 45 17 1.5 130 1580 62
## 3 Mcdonalds Doub… 1130 600 67 27 3 220 1920 63
## 4 Mcdonalds Gril… 750 280 31 10 0.5 155 1940 62
## 5 Mcdonalds Cris… 920 410 45 12 0.5 120 1980 81
## 6 Mcdonalds Big … 540 250 28 10 1 80 950 46
## # … with 7 more variables: fiber <dbl>, sugar <dbl>, protein <dbl>,
## # vit_a <dbl>, vit_c <dbl>, calcium <dbl>, salad <chr>, and abbreviated
## # variable names ¹restaurant, ²calories, ³total_fat, ⁴trans_fat,
## # ⁵cholesterol, ⁶total_carb
glimpse(fastfood)## Rows: 515
## Columns: 17
## $ restaurant <chr> "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdonalds", "Mcdon…
## $ item <chr> "Artisan Grilled Chicken Sandwich", "Single Bacon Smokehou…
## $ calories <dbl> 380, 840, 1130, 750, 920, 540, 300, 510, 430, 770, 380, 62…
## $ cal_fat <dbl> 60, 410, 600, 280, 410, 250, 100, 210, 190, 400, 170, 300,…
## $ total_fat <dbl> 7, 45, 67, 31, 45, 28, 12, 24, 21, 45, 18, 34, 20, 34, 8, …
## $ sat_fat <dbl> 2.0, 17.0, 27.0, 10.0, 12.0, 10.0, 5.0, 4.0, 11.0, 21.0, 4…
## $ trans_fat <dbl> 0.0, 1.5, 3.0, 0.5, 0.5, 1.0, 0.5, 0.0, 1.0, 2.5, 0.0, 1.5…
## $ cholesterol <dbl> 95, 130, 220, 155, 120, 80, 40, 65, 85, 175, 40, 95, 125, …
## $ sodium <dbl> 1110, 1580, 1920, 1940, 1980, 950, 680, 1040, 1040, 1290, …
## $ total_carb <dbl> 44, 62, 63, 62, 81, 46, 33, 49, 35, 42, 38, 48, 48, 67, 31…
## $ fiber <dbl> 3, 2, 3, 2, 4, 3, 2, 3, 2, 3, 2, 3, 3, 5, 2, 2, 3, 3, 5, 2…
## $ sugar <dbl> 11, 18, 18, 18, 18, 9, 7, 6, 7, 10, 5, 11, 11, 11, 6, 3, 1…
## $ protein <dbl> 37, 46, 70, 55, 46, 25, 15, 25, 25, 51, 15, 32, 42, 33, 13…
## $ vit_a <dbl> 4, 6, 10, 6, 6, 10, 10, 0, 20, 20, 2, 10, 10, 10, 2, 4, 6,…
## $ vit_c <dbl> 20, 20, 20, 25, 20, 2, 2, 4, 4, 6, 0, 10, 20, 15, 2, 6, 15…
## $ calcium <dbl> 20, 20, 50, 20, 20, 15, 10, 2, 15, 20, 15, 35, 35, 35, 4, …
## $ salad <chr> "Other", "Other", "Other", "Other", "Other", "Other", "Oth…
summary(fastfood)## restaurant item calories cal_fat
## Length:515 Length:515 Min. : 20.0 Min. : 0.0
## Class :character Class :character 1st Qu.: 330.0 1st Qu.: 120.0
## Mode :character Mode :character Median : 490.0 Median : 210.0
## Mean : 530.9 Mean : 238.8
## 3rd Qu.: 690.0 3rd Qu.: 310.0
## Max. :2430.0 Max. :1270.0
##
## total_fat sat_fat trans_fat cholesterol
## Min. : 0.00 Min. : 0.000 Min. :0.000 Min. : 0.00
## 1st Qu.: 14.00 1st Qu.: 4.000 1st Qu.:0.000 1st Qu.: 35.00
## Median : 23.00 Median : 7.000 Median :0.000 Median : 60.00
## Mean : 26.59 Mean : 8.153 Mean :0.465 Mean : 72.46
## 3rd Qu.: 35.00 3rd Qu.:11.000 3rd Qu.:1.000 3rd Qu.: 95.00
## Max. :141.00 Max. :47.000 Max. :8.000 Max. :805.00
##
## sodium total_carb fiber sugar
## Min. : 15 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 800 1st Qu.: 28.50 1st Qu.: 2.000 1st Qu.: 3.000
## Median :1110 Median : 44.00 Median : 3.000 Median : 6.000
## Mean :1247 Mean : 45.66 Mean : 4.137 Mean : 7.262
## 3rd Qu.:1550 3rd Qu.: 57.00 3rd Qu.: 5.000 3rd Qu.: 9.000
## Max. :6080 Max. :156.00 Max. :17.000 Max. :87.000
## NA's :12
## protein vit_a vit_c calcium
## Min. : 1.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 16.00 1st Qu.: 4.00 1st Qu.: 4.00 1st Qu.: 8.00
## Median : 24.50 Median : 10.00 Median : 10.00 Median : 20.00
## Mean : 27.89 Mean : 18.86 Mean : 20.17 Mean : 24.85
## 3rd Qu.: 36.00 3rd Qu.: 20.00 3rd Qu.: 30.00 3rd Qu.: 30.00
## Max. :186.00 Max. :180.00 Max. :400.00 Max. :290.00
## NA's :1 NA's :214 NA's :210 NA's :210
## salad
## Length:515
## Class :character
## Mode :character
##
##
##
##
Make a plot (or plots) to visualize the distributions of the amount of calories from fat of the options from these two restaurants. How do their centers, shapes, and spreads compare?
fastfood_short<- subset(fastfood, select= c(restaurant, calories, cal_fat))
fastfood_short_new <- fastfood_short %>%
filter(restaurant %in% c("Mcdonalds", "Dairy Queen"))
mcdonalds <- fastfood_short %>%
filter(restaurant == "Mcdonalds")
dairy_queen <- fastfood_short%>%
filter(restaurant == "Dairy Queen")
hist(mcdonalds$cal_fat)hist(dairy_queen$cal_fat)Frequency distribution- heights of the bars add up to the total number of observations
ggplot (data = mcdonalds, aes(x = cal_fat))+
geom_bar(width = 16)## Warning: `position_stack()` requires non-overlapping x intervals
ggplot (data = dairy_queen, aes(x = cal_fat))+
geom_bar(width = 16)## Warning: `position_stack()` requires non-overlapping x intervals
ggplot(data= fastfood_short_new, aes(x = cal_fat, y=, fill = restaurant)) +
geom_bar(width = 16)## Warning: `position_stack()` requires non-overlapping x intervals
While both have means around the same area, Dairy Queens has a more
narrow curve and Mcdonalds has more of a skew to the right, meaning it
has more high-calorie items than Dairy Queen. Dairy Queen has a more
normal curve.
dqmean <- mean(dairy_queen$cal_fat)
dqsd<- sd(dairy_queen$cal_fat)
mcmean <- mean(mcdonalds$cal_fat)
mcsd<- sd(mcdonalds$cal_fat)
dqmean## [1] 260.4762
dqsd## [1] 156.4851
mcmean## [1] 285.614
mcsd## [1] 220.8993
Density Histogram- areas of the bars add up to 1
ggplot(data = dairy_queen, aes(x = cal_fat)) +
geom_blank() +
geom_histogram(aes(y = ..density..), binwidth = 20) +
stat_function(fun = dnorm, args = c(mean = dqmean, sd = dqsd), col = "tomato")## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
#dnorm is density of normal curve function, specifying that want the curve to have the same m and sd as the column of cal_fat
ggplot(data = mcdonalds, aes(x = cal_fat)) +
geom_blank() + ##initializes a blank plot
geom_histogram(aes(y = ..density..), binwidth = 20) +
stat_function(fun = dnorm, args = c(mean = mcmean, sd = mcsd), col = "green")Both curves are normally distributed but Dairy Queen is more of a standard normal curve while Mcdonalds’ curve is right-skewed.
Make a normal probability plot of sim_norm. Do all of the points fall on the line? How does this plot compare to the probability plot for the real data? (Since sim_norm is not a dataframe, it can be put directly into the sample argument and the data argument can be dropped.)
Normal proabibility plot or Q-Q plot (quantile - quantile)
ggplot(data = dairy_queen, aes(sample = cal_fat)) +
geom_line(stat = "qq")
Close to diagonal line, waves at the ends show some skewness
ggplot(data = mcdonalds, aes(sample = cal_fat)) +
geom_line(stat = "qq")
Not as close to the diagonal, it deviates more and shows a lot more
skewness
What do probability plots look like for data that I know came from a normal distribution?
sim_norm <- rnorm(n = nrow(dairy_queen), mean = dqmean, sd = dqsd)
#n is how many numbers you want to generate - same number of menu items in the dq dataset
sim_norm #makng simulated data given parameters## [1] 275.66939 500.41939 232.54665 -45.89936 197.76868 477.12807
## [7] 580.17816 308.56847 393.35349 359.66085 44.04741 393.06487
## [13] 382.53389 65.86187 466.94803 140.00231 502.70394 485.79154
## [19] 236.29400 273.87460 464.91462 336.19600 347.03761 202.65942
## [25] 409.26349 672.68474 97.00805 186.01757 618.57195 297.63477
## [31] 504.21865 97.93330 500.55101 -104.77505 82.77431 261.91415
## [37] 200.78407 398.99426 485.16696 157.17813 219.25026 89.06227
normal probability plots based on each sample created, first is actual sample
qqnormsim(sample = cal_fat, data = dairy_queen)
All of them look fairly straight, with a few outliers, so it’s very
close to normal distribution
Plotting dairy queen data and sample from normal distributions
#create colors for legend
col = c("Simulation" = "green", "Dairy Queen" = "red")
#plot dairy queen data and sample from normal distribution
ggplot()+
geom_line(data = dairy_queen, aes(sample = cal_fat, color = "Dairy Queen"), stat = "qq")+
#geom_line(aes(sample = sim_norm, color= "Simulation", stat = "qq"))+
theme_classic()+
labs(color = "Legend")+
scale_color_manual(values = col)ggplot(, aes(sample = sim_norm, color = "Simulation"))+
geom_line(stat = "qq")ggplot(data = dairy_queen, aes(sample = cal_fat, color = "Dairy Queen"))+
geom_line(stat = "qq")Does the normal probability plot for the calories from fat look similar to the plots created for the simulated data? That is, do the plots provide evidence that the calories from fat are nearly normal?
Yes, the two are fairly close to each other, providing evidence that the calories from fat are nearly normal.
Using the same technique, determine whether or not the calories from McDonald’s menu appear to come from a normal distribution.
Simulated data
sim_norm2 <- rnorm(n = nrow(mcdonalds), mean = mcmean, sd = mcsd)
sim_norm2## [1] 249.831220 308.770289 50.816205 196.550232 656.881015 238.400449
## [7] 148.529039 48.775647 264.026778 284.106897 540.223214 -69.682067
## [13] 231.618779 474.643637 -25.922678 333.002027 44.096180 42.742519
## [19] 520.697014 438.750292 397.165310 403.130463 438.137816 535.892931
## [25] 84.394840 293.871354 490.943858 563.288776 44.937298 337.736413
## [31] -2.165098 622.099810 460.717433 35.359130 -115.344097 -88.260426
## [37] 566.115811 203.392374 460.359765 387.637232 307.428947 599.946834
## [43] 233.004670 352.260431 228.514933 59.514429 -15.047228 604.758719
## [49] 624.447771 193.460795 514.822414 198.495553 325.779116 355.320473
## [55] 197.925377 99.995363 397.884103
Q-Q plots based on each sample
qqnormsim(sample = cal_fat, data = mcdonalds)
All of the simulated plots look near straight while the plot based on
the sample data has a distinct curve that isn’t seen in the simulated
plots. The calories from the Mcdonald’s menu do not appear to come from
a normal distribution
What is the probability that a randomly chosen Dairy Queen product has more than 600 calories from fat?
1 - pnorm(q = 600, mean = dqmean, sd = dqsd)## [1] 0.01501523
#calculates Z score and consults Z table, giving area under the normal curve below given value q1.5% chance of a menu item having more than 600 calories
Calculate probability empirically
dairy_queen%>%
filter(cal_fat > 600)%>% #determining how many obs fall above 600
summarize(percent = n()/nrow(dairy_queen))#dividing # of obs over 600 by total sample size## # A tibble: 1 × 1
## percent
## <dbl>
## 1 0.0476
Closer the distribution is to normal, the more accurate theoretical probabilities will be
Write out two probability questions that you would like to answer about any of the restaurants in this dataset. Calculate those probabilities using both the theoretical normal distribution as well as the empirical distribution (four probabilities in all). Which one had a closer agreement between the two methods?
hist(mcdonalds$calories)cal_mean<- mean(mcdonalds$calories)
cal_sd<- sd(mcdonalds$calories)
cal_mean## [1] 640.3509
cal_sd## [1] 410.6961
pnorm(q = 300, mean = cal_mean, sd = cal_sd)## [1] 0.2036323
There is a 20% chance of picking a menu item that has 300 calories or less
mcdonalds%>%
filter(calories <= 300)%>%
summarize(percent = n()/nrow(mcdonalds))## # A tibble: 1 × 1
## percent
## <dbl>
## 1 0.158
There is a 15% chance of picking a menu item that has 300 calories or less
arbys <- fastfood%>%
select(restaurant, protein)%>%
filter(restaurant == "Arbys")
head(arbys)## # A tibble: 6 × 2
## restaurant protein
## <chr> <dbl>
## 1 Arbys 18
## 2 Arbys 18
## 3 Arbys 23
## 4 Arbys 39
## 5 Arbys 38
## 6 Arbys 38
hist(arbys$protein)mean(arbys$protein)## [1] 29.25455
amean <- mean(arbys$protein)
asd<- sd(arbys$protein)
amean## [1] 29.25455
asd## [1] 12.3861
arbys%>%
filter(protein > 40)%>%
summarize(percent = n()/nrow(arbys))## # A tibble: 1 × 1
## percent
## <dbl>
## 1 0.182
There is an 18% chance of picking a menu item with more than 40g of protein
1-pnorm(q = 40, mean = amean, sd = asd)## [1] 0.1928227
There is an 19% chance of picking a menu item with more than 40g of protein
The question about Arby’s protein had a closer agreement between the two methods, probably because the distribution was closer to a normal curve
###Exercise 7 Out of all the different restaurants, which ones’ distribution is the closest to normal for sodium?
sodium <- fastfood%>%
select(restaurant, sodium)
ggplot(data = sodium, aes(x = sodium))+
facet_wrap(~restaurant)+
geom_blank()+
geom_histogram(aes( y = ..density..))+
stat_function(fun = dnorm, args = c(mean = mean(sodium$sodium), sd = sd(sodium$sodium)), col = "tomato")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Subway’s distribution is the closest to normal for sodium.
Note that some of the normal probability plots for sodium distributions seem to have a stepwise pattern. why do you think this might be the case?
It may not be as smooth as the normal curve because the data is composed of discrete, whole numbers/grams of sodium
As you can see, normal probability plots can be used both to assess normality and visualize skewness. Make a normal probability plot for the total carbohydrates from a restaurant of your choice. Based on this normal probability plot, is this variable left skewed, symmetric, or right skewed? Use a histogram to confirm your findings.
bk <-fastfood%>%
select(restaurant, total_carb)%>%
filter(restaurant == "Burger King")
head(bk)## # A tibble: 6 × 2
## restaurant total_carb
## <chr> <dbl>
## 1 Burger King 21
## 2 Burger King 48
## 3 Burger King 32
## 4 Burger King 28
## 5 Burger King 48
## 6 Burger King 63
ggplot(data = bk, aes(sample = total_carb)) +
geom_line(stat = "qq")
This plot is normally distributed with a slight left skew
hist(bk$total_carb)ggplot(data = bk, aes(x = total_carb, y = ))+
geom_bar()#Tidyverse
cancer_reg <- readr::read_csv("https://raw.githubusercontent.com/Arnab777as3uj/STAT6021-Cancer-Prediction-Project/master/cancer_reg.csv")## Rows: 3047 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): binnedInc, Geography
## dbl (32): avgAnnCount, avgDeathsPerYear, TARGET_deathRate, incidenceRate, me...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(cancer_reg)## avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate
## Min. : 6.0 Min. : 3 Min. : 59.7 Min. : 201.3
## 1st Qu.: 76.0 1st Qu.: 28 1st Qu.:161.2 1st Qu.: 420.3
## Median : 171.0 Median : 61 Median :178.1 Median : 453.5
## Mean : 606.3 Mean : 186 Mean :178.7 Mean : 448.3
## 3rd Qu.: 518.0 3rd Qu.: 149 3rd Qu.:195.2 3rd Qu.: 480.9
## Max. :38150.0 Max. :14010 Max. :362.8 Max. :1206.9
##
## medIncome popEst2015 povertyPercent studyPerCap
## Min. : 22640 Min. : 827 Min. : 3.20 Min. : 0.00
## 1st Qu.: 38882 1st Qu.: 11684 1st Qu.:12.15 1st Qu.: 0.00
## Median : 45207 Median : 26643 Median :15.90 Median : 0.00
## Mean : 47063 Mean : 102637 Mean :16.88 Mean : 155.40
## 3rd Qu.: 52492 3rd Qu.: 68671 3rd Qu.:20.40 3rd Qu.: 83.65
## Max. :125635 Max. :10170292 Max. :47.40 Max. :9762.31
##
## binnedInc MedianAge MedianAgeMale MedianAgeFemale
## Length:3047 Min. : 22.30 Min. :22.40 Min. :22.30
## Class :character 1st Qu.: 37.70 1st Qu.:36.35 1st Qu.:39.10
## Mode :character Median : 41.00 Median :39.60 Median :42.40
## Mean : 45.27 Mean :39.57 Mean :42.15
## 3rd Qu.: 44.00 3rd Qu.:42.50 3rd Qu.:45.30
## Max. :624.00 Max. :64.70 Max. :65.70
##
## Geography AvgHouseholdSize PercentMarried PctNoHS18_24
## Length:3047 Min. :0.0221 Min. :23.10 Min. : 0.00
## Class :character 1st Qu.:2.3700 1st Qu.:47.75 1st Qu.:12.80
## Mode :character Median :2.5000 Median :52.40 Median :17.10
## Mean :2.4797 Mean :51.77 Mean :18.22
## 3rd Qu.:2.6300 3rd Qu.:56.40 3rd Qu.:22.70
## Max. :3.9700 Max. :72.50 Max. :64.10
##
## PctHS18_24 PctSomeCol18_24 PctBachDeg18_24 PctHS25_Over
## Min. : 0.0 Min. : 7.10 Min. : 0.000 Min. : 7.50
## 1st Qu.:29.2 1st Qu.:34.00 1st Qu.: 3.100 1st Qu.:30.40
## Median :34.7 Median :40.40 Median : 5.400 Median :35.30
## Mean :35.0 Mean :40.98 Mean : 6.158 Mean :34.80
## 3rd Qu.:40.7 3rd Qu.:46.40 3rd Qu.: 8.200 3rd Qu.:39.65
## Max. :72.5 Max. :79.00 Max. :51.800 Max. :54.80
## NA's :2285
## PctBachDeg25_Over PctEmployed16_Over PctUnemployed16_Over PctPrivateCoverage
## Min. : 2.50 Min. :17.60 Min. : 0.400 Min. :22.30
## 1st Qu.: 9.40 1st Qu.:48.60 1st Qu.: 5.500 1st Qu.:57.20
## Median :12.30 Median :54.50 Median : 7.600 Median :65.10
## Mean :13.28 Mean :54.15 Mean : 7.852 Mean :64.35
## 3rd Qu.:16.10 3rd Qu.:60.30 3rd Qu.: 9.700 3rd Qu.:72.10
## Max. :42.20 Max. :80.10 Max. :29.400 Max. :92.30
## NA's :152
## PctPrivateCoverageAlone PctEmpPrivCoverage PctPublicCoverage
## Min. :15.70 Min. :13.5 Min. :11.20
## 1st Qu.:41.00 1st Qu.:34.5 1st Qu.:30.90
## Median :48.70 Median :41.1 Median :36.30
## Mean :48.45 Mean :41.2 Mean :36.25
## 3rd Qu.:55.60 3rd Qu.:47.7 3rd Qu.:41.55
## Max. :78.90 Max. :70.7 Max. :65.10
## NA's :609
## PctPublicCoverageAlone PctWhite PctBlack PctAsian
## Min. : 2.60 Min. : 10.20 Min. : 0.0000 Min. : 0.0000
## 1st Qu.:14.85 1st Qu.: 77.30 1st Qu.: 0.6207 1st Qu.: 0.2542
## Median :18.80 Median : 90.06 Median : 2.2476 Median : 0.5498
## Mean :19.24 Mean : 83.65 Mean : 9.1080 Mean : 1.2540
## 3rd Qu.:23.10 3rd Qu.: 95.45 3rd Qu.:10.5097 3rd Qu.: 1.2210
## Max. :46.60 Max. :100.00 Max. :85.9478 Max. :42.6194
##
## PctOtherRace PctMarriedHouseholds BirthRate
## Min. : 0.0000 Min. :22.99 Min. : 0.000
## 1st Qu.: 0.2952 1st Qu.:47.76 1st Qu.: 4.521
## Median : 0.8262 Median :51.67 Median : 5.381
## Mean : 1.9835 Mean :51.24 Mean : 5.640
## 3rd Qu.: 2.1780 3rd Qu.:55.40 3rd Qu.: 6.494
## Max. :41.9303 Max. :78.08 Max. :21.326
##
cancer_reg<- dplyr::rename(cancer_reg, "cancer_death_rate" = "TARGET_deathRate")ggplot(cancer_reg, aes(x = povertyPercent, y = cancer_death_rate))+
geom_point(alpha = 0.25)+
xlab("Poverty Percentage")+
ylab("Cancer Death Rate")+
ggtitle("Cancer Deah vs Poverty by County in US")