library(knitr )
library(psych)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(Rmisc)
## Loading required package: lattice
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(pastecs)
##
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
##
## extract
## The following objects are masked from 'package:dplyr':
##
## first, last
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:plyr':
##
## is.discrete, summarize
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, units
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:psych':
##
## logit
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(lm.beta)
library(naniar)
library(Hmisc)
library(rstatix)
##
## Attaching package: 'rstatix'
## The following objects are masked from 'package:plyr':
##
## desc, mutate
## The following object is masked from 'package:stats':
##
## filter
library(effectsize)
##
## Attaching package: 'effectsize'
## The following objects are masked from 'package:rstatix':
##
## cohens_d, eta_squared
## The following object is masked from 'package:psych':
##
## phi
library(stringr)
mydata1 <- read.table("~/Downloads/MVA_2022_2023/Dataseti/white-castle-menu.csv",
header=TRUE,
sep=",",
dec=",")
mydata2 <- mydata1[ , c(-4,-7,-10,-11,-13,-14 ) ]
colnames(mydata2) <- c("Item", "Category", "Calories", "FatInGrams","SaturatedFatsInGrams", "CholesterolInMiligrams", "SodiumInMiligrams", "SugarsInGrams ")
head(mydata2)
## Item Category Calories FatInGrams
## 1 The Original Slider® Sliders 140.0 7.0
## 2 The Original Slider® (NY/ NJ) Sliders 150.0 7.0
## 3 Double Original Slider® Sliders 250.0 13.0
## 4 Double Original Slider® (NY/ NJ) Sliders 260.0 13.0
## 5 Cheese Sliders 170.0 9.0
## 6 Cheese (NY/ NJ) Sliders 170.0 9.0
## SaturatedFatsInGrams CholesterolInMiligrams SodiumInMiligrams SugarsInGrams
## 1 2.5 10.0 380.0 2.0
## 2 2.5 10.0 420.0 3.0
## 3 4.5 20.0 690.0 3.0
## 4 4.5 20.0 770.0 5.0
## 5 4.0 15.0 510.0 2.0
## 6 4.0 15.0 550.0 3.0
Unit of observation is a product offered by White Castle Sample size is the same as the number of observation due to the number of them (95)
Item is a specific product offered by the White Castle Category states if the product is a slider or a breakfast Calories shows the calories a specific product has Fat in grams shows how many grams of fat a product has Saturated fats in grams shows how many grams of saturated fats a specific product has Cholesterol in miligrams shows how many miligrams of cholesterol a product has Sodium in miligrams shows how many miligrams of sodium a product has Sugar in grams shows how much grams of sugar a product has
Hypothesis: White Castle produtc has same average of Saturated Fats In Grams and Fats In Grams.
H0: average FatInGrams = average SaturatedFatsInGrams H1: average FatInGrams ≠ averageSaturatedFatsInGrams
mydata3 <- transform(mydata2, FatInGrams=as.numeric(FatInGrams), SaturatedFatsInGrams=as.numeric(SaturatedFatsInGrams))
mydata3$Difference <- mydata3$FatInGrams - mydata3$SaturatedFatsInGrams
library(ggplot2)
ggplot(mydata3, aes(x = Difference)) +
geom_histogram(binwidth = 2, fill = "black", color = "orange") +
xlab("Differences")
From the visual representation it is visible that it is not normally
distributed. This is why I will also use shapiro wilk normality
test.
shapiro.test(mydata3$Difference)
##
## Shapiro-Wilk normality test
##
## data: mydata3$Difference
## W = 0.92989, p-value = 7.538e-05
H0: Distribution of saturated fats in grams and fats in grams is normal H1: Distribution of saturated fats in grams and fats in grams is not normal
From the test we can see that p value is lower than five percent. This leads to us rejecting the null hypothesis at p<0.001 Furthermore it leads to us having to do a non-parametrical test. I will use wilcoxon signed rank test
wilcox.test(mydata3$FatInGrams, mydata3$SaturatedFatsInGrams,
paired = TRUE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided")
##
## Wilcoxon signed rank test
##
## data: mydata3$FatInGrams and mydata3$SaturatedFatsInGrams
## V = 4560, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
From the wilcoxon signed rank test we can see that the p value is smaller than 5%. This is why we reject the null hypothesis at p<0.001 and we accept the H1 hypothesis.
H0:true location shift is equal to 0
H1:true location shift is not equal to 0
HO: Arithmetic mean from Sliders is the same as arithmetic mean from Breakfast H1: Arithmetic mean from Sliders is not the same as arithmetic mean from Breakfast
mydata3 <- transform(mydata2, Calories=as.numeric(Calories))
sliders <- mydata3[mydata3$Category == "Sliders",]
breakfasts <- mydata3[mydata3$Category == "Breakfast",]
describeBy(x = mydata3$Calories, group = mydata3$Category)
##
## Descriptive statistics by group
## group: Breakfast
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 45 335.11 102.88 350 338.11 118.61 150 490 340 -0.24 -0.98
## se
## X1 15.34
## ------------------------------------------------------------
## group: Sliders
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 50 286 123.19 230 267.25 81.54 140 570 430 1.25 0.42 17.42
Here we did some sample statistics for Slider and Breakfast products. We found out that the breakfast mean was approx. 335 and slider mean was 286.
If we want to draw some conclusion on population we need to do a formal test. But we need certain assumptions.
Firstly we need to check the normality. I will do that with the help of visual representation and shapiro test. I will also use 2 hyptheses
H0: Distribution of calories is the same H1: Distribution calories is different
ggplot(mydata3, aes(x = Calories))+
geom_histogram(binwidth = 25, fill = "black", colour = "orange")+
xlab("Calories")+
ylab("Frequency") +
facet_wrap(~Category, ncol = 1)
mydata3 %>%
group_by(Category) %>%
shapiro_test(Calories)
## # A tibble: 2 × 4
## Category variable statistic p
## <chr> <chr> <dbl> <dbl>
## 1 Breakfast Calories 0.945 0.0332
## 2 Sliders Calories 0.808 0.00000138
From the very beginning (visual representation) we aleready see that there is no normal distribution. Moreover the shapiro test also mathematically proved the visual representation.
Because p value is lower than 5 percent we can reject the H0 at p<0.001 -> this brings us to the conclusion that we do not have normal distribution and we do not meet the assumptions. So we need to use the non parametric test which will be Wilcox Rank Sum test.
wilcox.test(mydata3$Calories~ mydata3$Category,
paired = FALSE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided")
##
## Wilcoxon rank sum test
##
## data: mydata3$Calories by mydata3$Category
## W = 1493, p-value = 0.006039
## alternative hypothesis: true location shift is not equal to 0
Here we will again have 2 hypotheses which we get from the above part of the function.
H0:true location shift is equal to 0
H1:true location shift is not equal to 0
What we can see here is that the p value is smaller and we can reject the H0 at p=0.006
But in order to assess how big the differences truly are (huge differences or small differences) we will also test the effect size.
effectsize(wilcox.test(mydata3$Calories ~ mydata3$Category,
paired = FALSE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided"))
## r (rank biserial) | 95% CI
## --------------------------------
## 0.33 | [0.11, 0.52]
interpret_rank_biserial(0.33)
## [1] "large"
## (Rules: funder2019)
The effect size can be interpreted as “large”
This is why we can conclude that based on the sample data we see that Slider and Breakfast products differ in the Calories that they contain (p<0.001). Moreover from effect size we saw that Breakfast products have more calories than Slider products and that the difference in calories is large (r = 0.33)