# Load the built-in iris data set into the workspace.
data("iris")

# Preview the first six observations.
head(iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Per-column summary: quartiles and mean for the numeric columns, counts for Species.
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

DESCRIPTION: The data come from the built-in `iris` dataset in R. It has 150 observations of 4 numerical variables and 1 categorical variable, which are:

Sepal.Length: The length of the sepal, in centimeters.

Sepal.Width: The width of the sepal, in centimeters.

Petal.Length: The length of the petal, in centimeters.

Petal.Width: The width of the petal, in centimeters.

Species: The species of the iris flower, which can be setosa, versicolor, or virginica.

#1. INDEPENDENT T-TEST/WILCOXON RANK SUM TEST

NORMALITY CHECK:

Ho: The distribution of Sepal.Length is normal within each of the two species (versicolor & virginica).

H1: The distribution of Sepal.Length is not normal in at least one of the two species (versicolor & virginica).

# Shapiro-Wilk normality test on sepal length, versicolor rows only.
shapiro.test(iris$Sepal.Length[iris$Species == "versicolor"])
## 
##  Shapiro-Wilk normality test
## 
## data:  iris$Sepal.Length[iris$Species == "versicolor"]
## W = 0.97784, p-value = 0.4647
# Shapiro-Wilk normality test on sepal length, virginica rows only.
shapiro.test(iris$Sepal.Length[iris$Species == "virginica"])
## 
##  Shapiro-Wilk normality test
## 
## data:  iris$Sepal.Length[iris$Species == "virginica"]
## W = 0.97118, p-value = 0.2583

Given the results of the Shapiro-Wilk tests, we cannot reject the null hypothesis of normality for either species (both p-values are above the 0.05 threshold).

VARIANCES CHECK:

Ho: Variances are the same.

H1: Variances are not the same.

library(car)
## Loading required package: carData
# Levene's test for equal variances of sepal length across species.
# NOTE(review): this uses every row of iris, so setosa is included, while the
# comparison that follows is restricted to versicolor and virginica. Confirm
# whether the test should be limited to those two groups.
leveneTest(iris$Sepal.Length ~ iris$Species)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value   Pr(>F)   
## group   2  6.3527 0.002259 **
##       147                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

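Given the results of the Levene’s test we reject the null hypothesis of equal variances (p = 0.002). Note that this test was run on all three species, whereas the comparison below involves only versicolor and virginica.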

THE EQUAL-VARIANCE ASSUMPTION IS VIOLATED => NON-PARAMETRIC TEST:

Ho: True location shift is equal to 0.

H1: True location shift is not equal to 0.

# Two-sided Wilcoxon rank-sum test comparing sepal length between the
# versicolor and virginica samples.
wilcox.test(iris$Sepal.Length[iris$Species == "versicolor"] , iris$Sepal.Length[iris$Species == "virginica"] )
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  iris$Sepal.Length[iris$Species == "versicolor"] and iris$Sepal.Length[iris$Species == "virginica"]
## W = 526, p-value = 5.869e-07
## alternative hypothesis: true location shift is not equal to 0

Given the Wilcoxon rank sum test, we can reject the null hypothesis (p < 0.001): sepal length differs between versicolor and virginica.

#2. ONE-WAY ANOVA

# Copy Species into an explicit factor with a fixed level order
# (labels are kept identical to the levels).
iris$SpeciesF <- factor(
  iris$Species,
  levels = c("setosa", "versicolor", "virginica"),
  labels = c("setosa", "versicolor", "virginica")
)

NORMALITY CHECK:

Ho: Distribution of Petal length is normally distributed.

H1: Distribution of Petal length is not normally distributed.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rstatix)
## 
## Attaching package: 'rstatix'
## The following objects are masked from 'package:effectsize':
## 
##     cohens_d, eta_squared
## The following object is masked from 'package:stats':
## 
##     filter
# Shapiro-Wilk test of petal length, run separately within each species.
# Verbs are namespace-qualified so masking between attached packages
# cannot change which function is called.
iris %>%
  dplyr::group_by(SpeciesF) %>%
  rstatix::shapiro_test(Petal.Length)
## # A tibble: 3 × 4
##   SpeciesF   variable     statistic      p
##   <fct>      <chr>            <dbl>  <dbl>
## 1 setosa     Petal.Length     0.955 0.0548
## 2 versicolor Petal.Length     0.966 0.158 
## 3 virginica  Petal.Length     0.962 0.110

Given the Shapiro-Wilk test, we cannot reject the null hypothesis of normality for any of the three species (all p-values are above 0.05, although setosa is borderline at p = 0.055).

Ho: Variances are the same.

H1: Variances are not the same.

library(car)
# Levene's test for equal variances of petal length across the three species,
# written with the formula interface.
leveneTest(Petal.Length ~ SpeciesF, data = iris)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group   2   19.48 3.129e-08 ***
##       147                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

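The p-value is lower than 0.05, therefore we reject the null hypothesis: variances are not the same. Because the equal-variance assumption of the classical one-way ANOVA is violated, Welch's heteroscedastic F test is used instead.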

# install.packages("onewaytests")  # one-time setup if the package is missing
library(onewaytests)

# Welch's heteroscedastic F test of petal length across species;
# alpha is spelled out at the 0.05 level reported in the output.
welch.test(formula = Petal.Length ~ SpeciesF, data = iris, alpha = 0.05)
## 
##   Welch's Heteroscedastic F Test (alpha = 0.05) 
## ------------------------------------------------------------- 
##   data : Petal.Length and SpeciesF 
## 
##   statistic  : 1828.092 
##   num df     : 2 
##   denom df   : 78.07296 
##   p.value    : 2.693327e-66 
## 
##   Result     : Difference is statistically significant. 
## -------------------------------------------------------------
# Post-hoc pairwise comparisons of petal length between species.
# pool.sd = FALSE runs a separate unequal-variance t test for each pair,
# consistent with Levene's test above rejecting equal variances. A pooled SD
# would assume the same variance in all three groups.
# P-values are Bonferroni-adjusted for the three comparisons.
pairwise.t.test(x = iris$Petal.Length, g = iris$SpeciesF,
                pool.sd = FALSE,
                p.adjust.method = "bonferroni")
## 
##  Pairwise comparisons using t tests with non-pooled SD 
## 
## data:  iris$Petal.Length and iris$SpeciesF 
## 
##            setosa versicolor
## versicolor <2e-16 -         
## virginica  <2e-16 <2e-16    
## 
## P value adjustment method: bonferroni

Every pair of species differs in mean petal length, since all adjusted p-values are below 0.05.

# Bin sepal length into three size classes:
#   Small  : length < 5
#   Medium : 5 <= length < 7
#   Large  : length >= 7
# right = FALSE makes each interval closed on the left, matching those bounds.
iris$Size <- as.character(
  cut(iris$Sepal.Length,
      breaks = c(-Inf, 5, 7, Inf),
      labels = c("Small", "Medium", "Large"),
      right = FALSE)
)

# Factor version with the classes ordered from smallest to largest.
iris$SizeF <- factor(iris$Size, levels = c("Small", "Medium", "Large"))

#3. CHI-SQUARE TEST

Ho: There is no association between size and species.

H1: There is an association between size and species.

# Pearson chi-squared test of independence between species and size class.
# NOTE(review): the printed header reports no continuity correction, so
# correct=TRUE appears to have no effect on this 3x3 table. Confirm against
# ?chisq.test.
results <- chisq.test(iris$SpeciesF, iris$SizeF, 
                      correct=TRUE)
## Warning in chisq.test(iris$SpeciesF, iris$SizeF, correct = TRUE): Chi-squared
## approximation may be incorrect
# Print the test statistic, degrees of freedom and p-value.
results
## 
##  Pearson's Chi-squared test
## 
## data:  iris$SpeciesF and iris$SizeF
## X-squared = 57.575, df = 4, p-value = 9.369e-12

P value is less than 0.001, therefore we can reject the null hypothesis.

# Observed species-by-size counts with row and column totals.
addmargins(results$observed)
##              iris$SizeF
## iris$SpeciesF Small Medium Large Sum
##    setosa        20     30     0  50
##    versicolor     1     48     1  50
##    virginica      1     37    12  50
##    Sum           22    115    13 150
# Expected counts under independence, shown to two decimals.
round(results$expected, digits = 2)
##              iris$SizeF
## iris$SpeciesF Small Medium Large
##    setosa      7.33  38.33  4.33
##    versicolor  7.33  38.33  4.33
##    virginica   7.33  38.33  4.33

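The assumptions of the chi-square test are violated: the expected counts in the Large column (4.33) are below 5, so the chi-square approximation may be unreliable. Fisher's exact test is therefore run below.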

# Pearson residuals, (observed - expected) / sqrt(expected), to two decimals.
# The component name is spelled out in full rather than relying on `$`
# partial matching of "res".
round(results$residuals, 2)
##              iris$SizeF
## iris$SpeciesF Small Medium Large
##    setosa      4.68  -1.35 -2.08
##    versicolor -2.34   1.56 -1.60
##    virginica  -2.34  -0.22  3.68
# Cell proportions of the grand total, with row and column margins.
# Margins are added before rounding so the totals come from the exact
# proportions and not from sums of already-rounded cells.
round(addmargins(prop.table(results$observed)), 3)
##              iris$SizeF
## iris$SpeciesF Small Medium Large   Sum
##    setosa     0.133  0.200 0.000 0.333
##    versicolor 0.007  0.320 0.007 0.333
##    virginica  0.007  0.247 0.080 0.333
##    Sum        0.147  0.767 0.087 1.000
# Row proportions (size distribution within each species) with a row total.
# The total is added before rounding so it is computed from the exact
# proportions and not from already-rounded cells.
round(addmargins(prop.table(results$observed, 1), 2), 3)
##              iris$SizeF
## iris$SpeciesF Small Medium Large  Sum
##    setosa      0.40   0.60  0.00 1.00
##    versicolor  0.02   0.96  0.02 1.00
##    virginica   0.02   0.74  0.24 1.00
# Column proportions (species distribution within each size class) with a
# column total. The total is added before rounding so it is computed from the
# exact proportions and not from already-rounded cells.
round(addmargins(prop.table(results$observed, 2), 1), 3)
##              iris$SizeF
## iris$SpeciesF Small Medium Large
##    setosa     0.909  0.261 0.000
##    versicolor 0.045  0.417 0.077
##    virginica  0.045  0.322 0.923
##    Sum        1.000  1.000 1.000
# Fisher's exact test of association between size class and species; some
# expected counts in the chi-squared test above are below 5.
fisher.test(iris$SizeF, iris$SpeciesF)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  iris$SizeF and iris$SpeciesF
## p-value = 3.538e-12
## alternative hypothesis: two.sided

We can reject the null hypothesis (p < 0.001): there is an association between size and species.