Loading the dataset
# Load the iris dataset
data("iris")
# Preview the dataset
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
Data set explanation
Hypothesis Testing (Sepal.Lenght by Species) Research Question: Is there a significant difference in Sepal.Lenght between two species (setosa and versicolor)?
Data Preparation
# Filter data for two species
## install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mydata2 <- iris %>%
filter(Species %in% c("setosa", "versicolor"))
print(head(mydata2))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
Normality Test
setosa_sepal <- mydata2$Sepal.Length[mydata2$Species == "setosa"]
versicolor_sepal <- mydata2$Sepal.Length[mydata2$Species == "versicolor"]
shapiro.test(setosa_sepal)
##
## Shapiro-Wilk normality test
##
## data: setosa_sepal
## W = 0.9777, p-value = 0.4595
shapiro.test(versicolor_sepal)
##
## Shapiro-Wilk normality test
##
## data: versicolor_sepal
## W = 0.97784, p-value = 0.4647
# Histogram for setosa
ggplot(subset(mydata2, Species == "setosa"), aes(x = Sepal.Length)) +
geom_histogram(binwidth = 0.5, color = "black", fill = "dodgerblue") +
theme_minimal() +
xlab("Sepal Length") +
ylab("Frequency") +
labs(title = "Histogram of Sepal.Length for Setosa")
# Histogram for versicolor
ggplot(subset(mydata2, Species == "versicolor"), aes(x = Sepal.Length)) +
geom_histogram(binwidth = 0.5, color = "black", fill = "tomato") +
theme_minimal() +
xlab("Sepal Length") +
ylab("Frequency") +
labs(title = "Histogram of Sepal.Length for Versicolor")
# install psych
## install.packages("psych")
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
describe.by(mydata2$Sepal.Length , mydata2$Species, mat=TRUE)
## Warning in describe.by(mydata2$Sepal.Length, mydata2$Species, mat = TRUE):
## describe.by is deprecated. Please use the describeBy function
## item group1 vars n mean sd median trimmed mad min max
## X11 1 setosa 1 50 5.006 0.3524897 5.0 5.0025 0.29652 4.3 5.8
## X12 2 versicolor 1 50 5.936 0.5161711 5.9 5.9375 0.51891 4.9 7.0
## X13 3 virginica NA NA NA NA NA NA NA NA NA
## range skew kurtosis se
## X11 1.5 0.11297784 -0.4508724 0.04984957
## X12 2.1 0.09913926 -0.6939138 0.07299762
## X13 NA NA NA NA
So the variances are not equal, Welch correction needed.
t.test(Sepal.Length ~ Species,
data = mydata2,
var.equal= FALSE)
##
## Welch Two Sample t-test
##
## data: Sepal.Length by Species
## t = -10.521, df = 86.538, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group setosa and group versicolor is not equal to 0
## 95 percent confidence interval:
## -1.1057074 -0.7542926
## sample estimates:
## mean in group setosa mean in group versicolor
## 5.006 5.936
So we reject null hypothesis that the means are equal at p<0.001. need to calculate the effect size now.
## install.packages("effectsize")
library(effectsize)
##
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
##
## phi
effectsize :: cohens_d(Sepal.Length ~ Species,
data=mydata2)
## Cohen's d | 95% CI
## --------------------------
## -2.10 | [-2.59, -1.61]
##
## - Estimated using pooled SD.
interpret_cohens_d(2.10 , rules = "sawilowsky2009")
## [1] "huge"
## (Rules: sawilowsky2009)
Now. just in case performing non-parametric test. Wilcoxon Rank Sum test.
wilcox.test(Sepal.Length ~ Species,
data= mydata2)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Sepal.Length by Species
## W = 168.5, p-value = 8.346e-14
## alternative hypothesis: true location shift is not equal to 0
There is a significant difference in loation distribution of Sepal.Lenght between setosa and versicolor. The non parametric test confirms the findings of the parametric test. We choose parametric test because the assumptions of numerical values and normality were met.
Task 2: Research Queston 2: Is there a significant correlation between Sepal.Length and Petal.Length in the dataset?
Testing Normaity assumption
shapiro_sepal <- shapiro.test(mydata2$Sepal.Length)
shapiro_petal <- shapiro.test(mydata2$Petal.Length)
shapiro_petal
##
## Shapiro-Wilk normality test
##
## data: mydata2$Petal.Length
## W = 0.80185, p-value = 2.824e-10
shapiro_sepal
##
## Shapiro-Wilk normality test
##
## data: mydata2$Sepal.Length
## W = 0.96964, p-value = 0.02076
So based on the p-values we conclude that both variables are. not normally distributed; we should use Speraman’s rank correlation coefficient, which compared to Pearsn corellation des not require the normality assumption.
Scatterplot Matrix
##install.packages("car")
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
scatterplotMatrix(mydata2[ , c("Sepal.Length", "Petal.Length")], smooth = FALSE)
##install.packages("GGally")
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(mydata2[, c("Sepal.Length","Petal.Length")])
install.packages("Hmisc")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
I couldn’t install packages “Hmisc” so I found this way of writing it differently.
# Load necessary libraries
library(dplyr)
# Prepare data (filtering for setosa and versicolor as mydata2)
mydata2 <- iris %>% filter(Species %in% c("setosa", "versicolor"))
# Perform Spearman correlation test
spearman_test <- cor.test(mydata2$Sepal.Length, mydata2$Petal.Length, method = "spearman")
## Warning in cor.test.default(mydata2$Sepal.Length, mydata2$Petal.Length, :
## Cannot compute exact p-value with ties
# Display the result
spearman_test
##
## Spearman's rank correlation rho
##
## data: mydata2$Sepal.Length and mydata2$Petal.Length
## S = 30740, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8155428
Sperman’s rho is 0.8155428 which indicates strong positive correlationbetween Sepal.Length and Petal.Length fo the two species (setosa and versicolor), p-value is <0.001 which is much smaller the 5% this means. the result is statistically significant.
Task 3 Research Question: I there an association between hair color abd eye color? Null hypothesis: Hair color and eye color are independent (no association)
I had to use different data set because in first one there is just one categorical variable
mydata3 <- HairEyeColor
head(mydata3)
## , , Sex = Male
##
## Eye
## Hair Brown Blue Hazel Green
## Black 32 11 10 3
## Brown 53 50 25 15
## Red 10 10 7 7
## Blond 3 30 5 8
##
## , , Sex = Female
##
## Eye
## Hair Brown Blue Hazel Green
## Black 36 9 5 2
## Brown 66 34 29 14
## Red 16 7 7 7
## Blond 4 64 5 8
# Create the contingency table using the matrix function
mytable <- matrix(c(32, 53, 10, 3, # Brown eyes
11, 50, 10, 30, # Blue eyes
10, 25, 7, 5, # Hazel eyes
3, 15, 7, 8), # Green eyes
nrow = 4)
rownames(mytable) <- c("Black", "Brown", "Red", "Blond")
colnames(mytable) <- c("Brown", "Blue", "Hazel", "Green")
print(mytable)
## Brown Blue Hazel Green
## Black 32 11 10 3
## Brown 53 50 25 15
## Red 10 10 7 7
## Blond 3 30 5 8
chi_squared <- chisq.test(mytable,
correct= FALSE)
## Warning in chisq.test(mytable, correct = FALSE): Chi-squared approximation may
## be incorrect
chi_squared
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 41.28, df = 9, p-value = 4.447e-06
Based on Chi-Square test, the is statistically significant association between hair color and eye color.
addmargins(chi_squared$observed)
## Brown Blue Hazel Green Sum
## Black 32 11 10 3 56
## Brown 53 50 25 15 143
## Red 10 10 7 7 34
## Blond 3 30 5 8 46
## Sum 98 101 47 33 279
`
addmargins(round(chi_squared$expected, 2))
## Brown Blue Hazel Green Sum
## Black 19.67 20.27 9.43 6.62 55.99
## Brown 50.23 51.77 24.09 16.91 143.00
## Red 11.94 12.31 5.73 4.02 34.00
## Blond 16.16 16.65 7.75 5.44 46.00
## Sum 98.00 101.00 47.00 32.99 278.99
round(chi_squared$res, 2)
## Brown Blue Hazel Green
## Black 2.78 -2.06 0.18 -1.41
## Brown 0.39 -0.25 0.19 -0.47
## Red -0.56 -0.66 0.53 1.49
## Blond -3.27 3.27 -0.99 1.10
Significan deviations exist, particularly for: Black-haired individuals with brown eyes (2.78) difference is found at Alpha 1%. Blond-haired individuals with blue eyes (3.27) at Alpha 1%.
addmargins(round(prop.table(chi_squared$observed), 3))
## Brown Blue Hazel Green Sum
## Black 0.115 0.039 0.036 0.011 0.201
## Brown 0.190 0.179 0.090 0.054 0.513
## Red 0.036 0.036 0.025 0.025 0.122
## Blond 0.011 0.108 0.018 0.029 0.166
## Sum 0.352 0.362 0.169 0.119 1.002
addmargins(round(prop.table(chi_squared$observed, 1), 3), 2)
## Brown Blue Hazel Green Sum
## Black 0.571 0.196 0.179 0.054 1.000
## Brown 0.371 0.350 0.175 0.105 1.001
## Red 0.294 0.294 0.206 0.206 1.000
## Blond 0.065 0.652 0.109 0.174 1.000
addmargins(round(prop.table(chi_squared$observed, 2), 3), 1)
## Brown Blue Hazel Green
## Black 0.327 0.109 0.213 0.091
## Brown 0.541 0.495 0.532 0.455
## Red 0.102 0.099 0.149 0.212
## Blond 0.031 0.297 0.106 0.242
## Sum 1.001 1.000 1.000 1.000
library(effectsize)
effectsize::cramers_v(mytable)
## Cramer's V (adj.) | 95% CI
## --------------------------------
## 0.20 | [0.09, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
library(effectsize)
interpret_cramers_v(0.20)
## [1] "medium"
## (Rules: funder2019)
Cramer V effect size indicates a medium-strenght association.