library(knitr)
library(psych)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(Rmisc)
## Loading required package: lattice
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(pastecs)
## 
## Attaching package: 'pastecs'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:plyr':
## 
##     is.discrete, summarize
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:psych':
## 
##     logit
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(lm.beta)
library(naniar)
library(rstatix)
## 
## Attaching package: 'rstatix'
## The following objects are masked from 'package:plyr':
## 
##     desc, mutate
## The following object is masked from 'package:stats':
## 
##     filter
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following objects are masked from 'package:rstatix':
## 
##     cohens_d, eta_squared
## The following object is masked from 'package:psych':
## 
##     phi
library(stringr)

Homework 2

# read the raw menu data from the CSV file
mydata1 <- read.table("~/Downloads/MVA_2022_2023/Dataseti/white-castle-menu.csv", 
                     header=TRUE, 
                     sep=",", 
                     dec=",")

# drop the columns that are not used in the analysis
mydata2 <- mydata1[ , c(-4,-7,-10,-11,-13,-14) ]

# give the remaining columns descriptive names
colnames(mydata2) <- c("Item", "Category", "Calories", "FatInGrams", "SaturatedFatsInGrams", "CholesterolInMiligrams", "SodiumInMiligrams", "SugarsInGrams")
head(mydata2)
##                               Item Category Calories FatInGrams
## 1             The Original Slider®  Sliders    140.0        7.0
## 2    The Original Slider® (NY/ NJ)  Sliders    150.0        7.0
## 3          Double Original Slider®  Sliders    250.0       13.0
## 4 Double Original Slider® (NY/ NJ)  Sliders    260.0       13.0
## 5                           Cheese  Sliders    170.0        9.0
## 6                  Cheese (NY/ NJ)  Sliders    170.0        9.0
##   SaturatedFatsInGrams CholesterolInMiligrams SodiumInMiligrams SugarsInGrams 
## 1                  2.5                   10.0             380.0            2.0
## 2                  2.5                   10.0             420.0            3.0
## 3                  4.5                   20.0             690.0            3.0
## 4                  4.5                   20.0             770.0            5.0
## 5                  4.0                   15.0             510.0            2.0
## 6                  4.0                   15.0             550.0            3.0

The unit of observation is a single product offered by White Castle. The sample size equals the number of observations (95), because every menu item in the dataset is included.

Item: the specific product offered by White Castle.
Category: whether the product is a Slider or a Breakfast item.
Calories: the number of calories in the product.
FatInGrams: the grams of fat in the product.
SaturatedFatsInGrams: the grams of saturated fat in the product.
CholesterolInMiligrams: the milligrams of cholesterol in the product.
SodiumInMiligrams: the milligrams of sodium in the product.
SugarsInGrams: the grams of sugar in the product.
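
A quick structural check of the cleaned data can be useful here; a minimal sketch (it only prints the column types and the number of products per category from the mydata2 object created above):

# inspect the cleaned data: column types and category counts
str(mydata2)
table(mydata2$Category)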

3: Main goals

3.1 First, I want to compare fat and saturated fat (both in grams) for White Castle products. To assess this, I check the average fat and the average saturated fat per product and test whether they differ.

Hypothesis: a White Castle product has, on average, the same amount of fat in grams and saturated fat in grams.

H0: average FatInGrams = average SaturatedFatsInGrams

H1: average FatInGrams ≠ average SaturatedFatsInGrams

# make sure both variables are numeric
mydata3 <- transform(mydata2, FatInGrams=as.numeric(FatInGrams), SaturatedFatsInGrams=as.numeric(SaturatedFatsInGrams))

# paired difference between fat and saturated fat for each product
mydata3$Difference <- mydata3$FatInGrams - mydata3$SaturatedFatsInGrams

# histogram of the paired differences
ggplot(mydata3, aes(x = Difference)) +
  geom_histogram(binwidth = 2, fill = "black", color = "orange") +
  xlab("Differences")

The visual representation suggests that the differences are not normally distributed. To confirm this, I also run the Shapiro-Wilk normality test.

shapiro.test(mydata3$Difference)
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata3$Difference
## W = 0.92989, p-value = 7.538e-05

H0: The distribution of the differences (fat minus saturated fat) is normal.

H1: The distribution of the differences is not normal.

The p-value of the test is below 5%, so we reject the null hypothesis at p < 0.001. Since the normality assumption is violated, a non-parametric test is needed; I will use the Wilcoxon signed-rank test.

wilcox.test(mydata3$FatInGrams,  mydata3$SaturatedFatsInGrams,
            paired = TRUE,
            correct = FALSE,
            exact = FALSE,
            alternative = "two.sided")
## 
##  Wilcoxon signed rank test
## 
## data:  mydata3$FatInGrams and mydata3$SaturatedFatsInGrams
## V = 4560, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

The hypotheses of the Wilcoxon signed-rank test are:

H0: the true location shift is equal to 0

H1: the true location shift is not equal to 0

The p-value of the test is well below 5%, so we reject H0 at p < 0.001 in favour of H1: the amount of fat and the amount of saturated fat differ.
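
To quantify how large this paired difference is, the same effect-size approach used in section 3.2 below could also be applied here; a minimal sketch using the effectsize package (not part of the original analysis):

# rank-biserial correlation for the paired fat vs. saturated fat comparison
rank_biserial(mydata3$FatInGrams, mydata3$SaturatedFatsInGrams, paired = TRUE)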

3.2 Second, I want to know whether Slider and Breakfast products differ in the calories they contain.

H0: The arithmetic mean of calories for Sliders is the same as for Breakfast products.

H1: The arithmetic mean of calories for Sliders is not the same as for Breakfast products.

# make sure Calories is numeric
mydata3 <- transform(mydata2, Calories=as.numeric(Calories))

# split the data by product category
sliders <- mydata3[mydata3$Category == "Sliders",]
breakfasts <- mydata3[mydata3$Category == "Breakfast",]

describeBy(x = mydata3$Calories, group = mydata3$Category)
## 
##  Descriptive statistics by group 
## group: Breakfast
##    vars  n   mean     sd median trimmed    mad min max range  skew kurtosis
## X1    1 45 335.11 102.88    350  338.11 118.61 150 490   340 -0.24    -0.98
##       se
## X1 15.34
## ------------------------------------------------------------ 
## group: Sliders
##    vars  n mean     sd median trimmed   mad min max range skew kurtosis    se
## X1    1 50  286 123.19    230  267.25 81.54 140 570   430 1.25     0.42 17.42

These descriptive statistics summarise calories for Slider and Breakfast products: the Breakfast mean is approximately 335 calories and the Slider mean is 286 calories.

To draw conclusions about the population we need a formal test, which in turn requires checking certain assumptions.

First we check normality, using both a visual representation and the Shapiro-Wilk test, with the following two hypotheses.

H0: The distribution of calories (within each category) is normal.

H1: The distribution of calories is not normal.

ggplot(mydata3, aes(x = Calories))+
  geom_histogram(binwidth = 25, fill = "black", colour = "orange")+
  xlab("Calories")+
  ylab("Frequency") +
  facet_wrap(~Category, ncol = 1)

mydata3 %>%
  group_by(Category) %>%
  shapiro_test(Calories)
## # A tibble: 2 × 4
##   Category  variable statistic          p
##   <chr>     <chr>        <dbl>      <dbl>
## 1 Breakfast Calories     0.945 0.0332    
## 2 Sliders   Calories     0.808 0.00000138

The visual representation already suggests that the distributions are not normal, and the Shapiro-Wilk tests confirm this formally.

Both p-values are below 5% (0.033 for Breakfast and < 0.001 for Sliders), so we reject H0 for both groups. The normality assumption is therefore not met, and we use a non-parametric test, the Wilcoxon rank-sum test.

wilcox.test(mydata3$Calories~ mydata3$Category,
            paired = FALSE,
            correct = FALSE,
            exact = FALSE,
            alternative = "two.sided")
## 
##  Wilcoxon rank sum test
## 
## data:  mydata3$Calories by mydata3$Category
## W = 1493, p-value = 0.006039
## alternative hypothesis: true location shift is not equal to 0

The test again has two hypotheses, which follow from the function call above.

H0:true location shift is equal to 0

H1:true location shift is not equal to 0

The p-value is below 5%, so we reject H0 at p = 0.006.

To assess how large the difference actually is, we also compute the effect size.

effectsize(wilcox.test(mydata3$Calories ~ mydata3$Category,
            paired = FALSE,
            correct = FALSE,
            exact = FALSE,
            alternative = "two.sided"))
## r (rank biserial) |       95% CI
## --------------------------------
## 0.33              | [0.11, 0.52]
interpret_rank_biserial(0.33)
## [1] "large"
## (Rules: funder2019)

Under the default Funder (2019) rules, the effect size of r = 0.33 is interpreted as “large”.

Based on the sample data we can therefore conclude that Slider and Breakfast products differ in the calories they contain (p = 0.006). Together with the group means, the effect size indicates that Breakfast products contain more calories than Sliders and that this difference is large (r = 0.33).

3.3 Finally, I want to know the proportion of Breakfast products on the White Castle menu and whether it is larger than 0.50, i.e. larger than the proportion of Sliders.

In order to determine which test to use, we need to check the following assumptions:

1. n·π0 = 95·0.50 = 47.5
2. n·(1 − π0) = 95·0.50 = 47.5
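
These quantities can also be computed directly from the data; a minimal sketch (counting the Breakfast items and evaluating both assumption checks):

# count Breakfast products and check n*pi0 and n*(1 - pi0)
n_breakfast <- sum(mydata3$Category == "Breakfast")
n_total <- nrow(mydata3)
pi0 <- 0.50
n_breakfast
n_total * pi0
n_total * (1 - pi0)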

We will use the following hypotheses

H0: The proportion of Breakfast products on the menu is 0.50.

H1: The proportion of Breakfast products on the menu is more than 0.50.

We check these hypotheses with a proportion test (prop.test), since both assumption values are well above 5.

prop.test(x = 45,
          n = 95,
          p = 0.50,
          correct = FALSE,
          alternative = "greater")
## 
##  1-sample proportions test without continuity correction
## 
## data:  45 out of 95, null probability 0.5
## X-squared = 0.26316, df = 1, p-value = 0.696
## alternative hypothesis: true p is greater than 0.5
## 95 percent confidence interval:
##  0.3913223 1.0000000
## sample estimates:
##         p 
## 0.4736842

The p-value (0.696) is above 0.05, so we do not reject H0. Based on this test we cannot conclude that Breakfast products make up more than half of the White Castle menu; the sample proportion of Breakfast products is 0.474.
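
As a possible cross-check (not part of the original analysis), the exact binomial test could be applied to the same counts; a minimal sketch:

# exact binomial test for the proportion of Breakfast products
binom.test(x = 45, n = 95, p = 0.50, alternative = "greater")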