Loading the dataset

# Load the iris dataset
data("iris")
# Preview the dataset
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

Data set explanation

Hypothesis Testing (Sepal.Lenght by Species) Research Question: Is there a significant difference in Sepal.Lenght between two species (setosa and versicolor)?

Data Preparation

# Filter data for two species
 ## install.packages("dplyr")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
mydata2 <- iris %>%
  filter(Species %in% c("setosa", "versicolor"))
print(head(mydata2))
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

Normality Test

setosa_sepal <- mydata2$Sepal.Length[mydata2$Species == "setosa"]
versicolor_sepal <- mydata2$Sepal.Length[mydata2$Species == "versicolor"]

shapiro.test(setosa_sepal)
## 
##  Shapiro-Wilk normality test
## 
## data:  setosa_sepal
## W = 0.9777, p-value = 0.4595
shapiro.test(versicolor_sepal)
## 
##  Shapiro-Wilk normality test
## 
## data:  versicolor_sepal
## W = 0.97784, p-value = 0.4647
# Histogram for setosa
ggplot(subset(mydata2, Species == "setosa"), aes(x = Sepal.Length)) +
  geom_histogram(binwidth = 0.5, color = "black", fill = "dodgerblue") +
  theme_minimal() +
  xlab("Sepal Length") +
  ylab("Frequency") +
  labs(title = "Histogram of Sepal.Length for Setosa")

# Histogram for versicolor
ggplot(subset(mydata2, Species == "versicolor"), aes(x = Sepal.Length)) +
  geom_histogram(binwidth = 0.5, color = "black", fill = "tomato") +
  theme_minimal() +
  xlab("Sepal Length") +
  ylab("Frequency") +
  labs(title = "Histogram of Sepal.Length for Versicolor")

# install psych 
## install.packages("psych")
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
describe.by(mydata2$Sepal.Length , mydata2$Species, mat=TRUE)
## Warning in describe.by(mydata2$Sepal.Length, mydata2$Species, mat = TRUE):
## describe.by is deprecated.  Please use the describeBy function
##     item     group1 vars  n  mean        sd median trimmed     mad min max
## X11    1     setosa    1 50 5.006 0.3524897    5.0  5.0025 0.29652 4.3 5.8
## X12    2 versicolor    1 50 5.936 0.5161711    5.9  5.9375 0.51891 4.9 7.0
## X13    3  virginica   NA NA    NA        NA     NA      NA      NA  NA  NA
##     range       skew   kurtosis         se
## X11   1.5 0.11297784 -0.4508724 0.04984957
## X12   2.1 0.09913926 -0.6939138 0.07299762
## X13    NA         NA         NA         NA

So the variances are not equal, Welch correction needed.

t.test(Sepal.Length ~ Species,
       data = mydata2,
       var.equal= FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  Sepal.Length by Species
## t = -10.521, df = 86.538, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group setosa and group versicolor is not equal to 0
## 95 percent confidence interval:
##  -1.1057074 -0.7542926
## sample estimates:
##     mean in group setosa mean in group versicolor 
##                    5.006                    5.936

So we reject null hypothesis that the means are equal at p<0.001. need to calculate the effect size now.

## install.packages("effectsize")
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
## 
##     phi
effectsize :: cohens_d(Sepal.Length ~ Species,
                       data=mydata2)
## Cohen's d |         95% CI
## --------------------------
## -2.10     | [-2.59, -1.61]
## 
## - Estimated using pooled SD.
interpret_cohens_d(2.10 , rules = "sawilowsky2009")
## [1] "huge"
## (Rules: sawilowsky2009)

Now. just in case performing non-parametric test. Wilcoxon Rank Sum test.

wilcox.test(Sepal.Length ~ Species, 
            data= mydata2)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Sepal.Length by Species
## W = 168.5, p-value = 8.346e-14
## alternative hypothesis: true location shift is not equal to 0

There is a significant difference in loation distribution of Sepal.Lenght between setosa and versicolor. The non parametric test confirms the findings of the parametric test. We choose parametric test because the assumptions of numerical values and normality were met.

Task 2: Research Queston 2: Is there a significant correlation between Sepal.Length and Petal.Length in the dataset?

Testing Normaity assumption

shapiro_sepal <- shapiro.test(mydata2$Sepal.Length)
shapiro_petal <- shapiro.test(mydata2$Petal.Length)

shapiro_petal
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata2$Petal.Length
## W = 0.80185, p-value = 2.824e-10
shapiro_sepal
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata2$Sepal.Length
## W = 0.96964, p-value = 0.02076

So based on the p-values we conclude that both variables are. not normally distributed; we should use Speraman’s rank correlation coefficient, which compared to Pearsn corellation des not require the normality assumption.

Scatterplot Matrix

##install.packages("car")
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
scatterplotMatrix(mydata2[ , c("Sepal.Length", "Petal.Length")], smooth = FALSE)

##install.packages("GGally")
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(mydata2[, c("Sepal.Length","Petal.Length")])

install.packages("Hmisc")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

I couldn’t install packages “Hmisc” so I found this way of writing it differently.

# Load necessary libraries
library(dplyr)

# Prepare data (filtering for setosa and versicolor as mydata2)
mydata2 <- iris %>% filter(Species %in% c("setosa", "versicolor"))

# Perform Spearman correlation test
spearman_test <- cor.test(mydata2$Sepal.Length, mydata2$Petal.Length, method = "spearman")
## Warning in cor.test.default(mydata2$Sepal.Length, mydata2$Petal.Length, :
## Cannot compute exact p-value with ties
# Display the result
spearman_test
## 
##  Spearman's rank correlation rho
## 
## data:  mydata2$Sepal.Length and mydata2$Petal.Length
## S = 30740, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8155428

Sperman’s rho is 0.8155428 which indicates strong positive correlationbetween Sepal.Length and Petal.Length fo the two species (setosa and versicolor), p-value is <0.001 which is much smaller the 5% this means. the result is statistically significant.

Task 3 Research Question: I there an association between hair color abd eye color? Null hypothesis: Hair color and eye color are independent (no association)

I had to use different data set because in first one there is just one categorical variable

mydata3 <- HairEyeColor
head(mydata3)
## , , Sex = Male
## 
##        Eye
## Hair    Brown Blue Hazel Green
##   Black    32   11    10     3
##   Brown    53   50    25    15
##   Red      10   10     7     7
##   Blond     3   30     5     8
## 
## , , Sex = Female
## 
##        Eye
## Hair    Brown Blue Hazel Green
##   Black    36    9     5     2
##   Brown    66   34    29    14
##   Red      16    7     7     7
##   Blond     4   64     5     8
# Create the contingency table using the matrix function
mytable <- matrix(c(32, 53, 10, 3,  # Brown eyes
                    11, 50, 10, 30, # Blue eyes
                    10, 25, 7, 5,   # Hazel eyes
                    3, 15, 7, 8),   # Green eyes
                  nrow = 4)
rownames(mytable) <- c("Black", "Brown", "Red", "Blond")
colnames(mytable) <- c("Brown", "Blue", "Hazel", "Green")

print(mytable)
##       Brown Blue Hazel Green
## Black    32   11    10     3
## Brown    53   50    25    15
## Red      10   10     7     7
## Blond     3   30     5     8
chi_squared <- chisq.test(mytable,
                          correct= FALSE)
## Warning in chisq.test(mytable, correct = FALSE): Chi-squared approximation may
## be incorrect
chi_squared
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 41.28, df = 9, p-value = 4.447e-06

Based on Chi-Square test, the is statistically significant association between hair color and eye color.

addmargins(chi_squared$observed)
##       Brown Blue Hazel Green Sum
## Black    32   11    10     3  56
## Brown    53   50    25    15 143
## Red      10   10     7     7  34
## Blond     3   30     5     8  46
## Sum      98  101    47    33 279

`

addmargins(round(chi_squared$expected, 2))
##       Brown   Blue Hazel Green    Sum
## Black 19.67  20.27  9.43  6.62  55.99
## Brown 50.23  51.77 24.09 16.91 143.00
## Red   11.94  12.31  5.73  4.02  34.00
## Blond 16.16  16.65  7.75  5.44  46.00
## Sum   98.00 101.00 47.00 32.99 278.99
round(chi_squared$res, 2)
##       Brown  Blue Hazel Green
## Black  2.78 -2.06  0.18 -1.41
## Brown  0.39 -0.25  0.19 -0.47
## Red   -0.56 -0.66  0.53  1.49
## Blond -3.27  3.27 -0.99  1.10

Significan deviations exist, particularly for: Black-haired individuals with brown eyes (2.78) difference is found at Alpha 1%. Blond-haired individuals with blue eyes (3.27) at Alpha 1%.

addmargins(round(prop.table(chi_squared$observed), 3))
##       Brown  Blue Hazel Green   Sum
## Black 0.115 0.039 0.036 0.011 0.201
## Brown 0.190 0.179 0.090 0.054 0.513
## Red   0.036 0.036 0.025 0.025 0.122
## Blond 0.011 0.108 0.018 0.029 0.166
## Sum   0.352 0.362 0.169 0.119 1.002
addmargins(round(prop.table(chi_squared$observed, 1), 3), 2)
##       Brown  Blue Hazel Green   Sum
## Black 0.571 0.196 0.179 0.054 1.000
## Brown 0.371 0.350 0.175 0.105 1.001
## Red   0.294 0.294 0.206 0.206 1.000
## Blond 0.065 0.652 0.109 0.174 1.000
addmargins(round(prop.table(chi_squared$observed, 2), 3), 1)
##       Brown  Blue Hazel Green
## Black 0.327 0.109 0.213 0.091
## Brown 0.541 0.495 0.532 0.455
## Red   0.102 0.099 0.149 0.212
## Blond 0.031 0.297 0.106 0.242
## Sum   1.001 1.000 1.000 1.000
library(effectsize)
effectsize::cramers_v(mytable)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.20              | [0.09, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
library(effectsize)
interpret_cramers_v(0.20)
## [1] "medium"
## (Rules: funder2019)

Cramer V effect size indicates a medium-strenght association.