importdata <- read.table ("./responses.csv", header=TRUE, sep=",", dec=".")
#Selected just the variables relating to music genre.
customdata <- importdata %>% dplyr::select(3, 4, 9, 12
)
#Inserted ID column
customdata1 <- tibble::rowid_to_column(customdata, "ID")
#Changed missing data to NA and removed all NA's from the data.
nachange <- customdata1 %>%
na_if ('') %>%
drop_na()
head(nachange)
## ID Dance Folk Rock Hiphop..Rap
## 1 1 2 1 5 1
## 2 2 2 1 5 1
## 3 3 2 2 5 1
## 4 4 2 1 2 2
## 5 5 4 3 3 5
## 6 6 2 3 5 4
Research Question 1:
Are there differences in enjoyment of music Genres among students?
H0: Students enjoy all music Genres equally
H1: Students enjoy the Rock music Genre the most
Unit of Observation: Students
Variables:
-Dance How much a student enjoys Dance music, from Donβt enjoy at all 1-2-3-4-5 to Enjoy very much
-Folk How much a student enjoys Folk music, from Donβt enjoy at all 1-2-3-4-5 to Enjoy very much
-Punk How much a student enjoys Punk music, from Donβt enjoy at all 1-2-3-4-5 to Enjoy very much
-Rock How much a student enjoys Rock music, from Donβt enjoy at all 1-2-3-4-5 to Enjoy very much
#Created a random sample of the data which always returns the same random sample.
set.seed(1)
mydataR1 <- nachange %>% sample_frac(.2)
#Changed data from wide format to long, gathered music Genres under new variable "Genre". Changed this new variable to a factor.
mydata_longR1 <- mydataR1 %>%
gather(key = "Genre",
value = "Enjoyment",
Dance, Folk, Hiphop..Rap, Rock,) %>%
convert_as_factor(Genre)
head(mydata_longR1)
## ID Genre Enjoyment
## 1 850 Dance 1
## 2 691 Dance 4
## 3 132 Dance 3
## 4 947 Dance 2
## 5 521 Dance 3
## 6 482 Dance 4
#Descriptive statistics of Enjoyment by Genre of music.
mydata_longR1 %>%
group_by(Genre) %>%
get_summary_stats(Enjoyment, type = "common")
## # A tibble: 4 Γ 11
## Genre variable n min max median iqr mean sd se ci
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Dance Enjoyment 198 1 5 3 2 3.08 1.13 0.08 0.158
## 2 Folk Enjoyment 198 1 5 2 2 2.22 1.13 0.08 0.158
## 3 Hiphop..Rap Enjoyment 198 1 5 3 2 2.88 1.36 0.096 0.19
## 4 Rock Enjoyment 198 1 5 4 2 3.76 1.20 0.085 0.168
Initial descriptive statistics reveal that the differences exist in the mean enjoyment of each Genre. To test whether this is significant, we will consider One-way repeated measures ANOVA.
First we must check if assumptions are met.
#created a boxplot the spread of each Genre. Units shown as individual plots to visualize outliers
ggboxplot(mydata_longR1,
x = "Genre",
y = "Enjoyment",
add = "jitter"
)
#Checking if variables are normally distributed with Shapiro-Wilk test
mydata_longR1 %>%
group_by(Genre) %>%
shapiro_test(Enjoyment)
## # A tibble: 4 Γ 4
## Genre variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Dance Enjoyment 0.916 3.66e- 9
## 2 Folk Enjoyment 0.859 1.44e-12
## 3 Hiphop..Rap Enjoyment 0.897 1.76e-10
## 4 Rock Enjoyment 0.855 9.02e-13
H0: The variable is not normally distributed
H1: The variable is normally distributed
All p values are below 0.05 so we cannot reject the null hypothesis for any variable, none of our variables are normally distributed. Therefore, we must use a non-parametric test for testing our hypothesis.
In this case we are testing the hypothesis about the equality of three or more population arithmetic means for dependent samples. We have ordinal variables. Therefore, the appropriate non-parametric test is Friedman Anova.
#Computing Friedman ANOVA
FriedmanANOVAR1 <- friedman_test(Enjoyment ~ Genre | ID,
data = mydata_longR1)
FriedmanANOVAR1
## # A tibble: 1 Γ 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <dbl> <dbl> <chr>
## 1 Enjoyment 198 130. 3 5.71e-28 Friedman test
H:0 Median of Dance = Folk = Pop = Rock
H:1 At least one Median of the music Genres is not equal to the others
p value is below 0.0001 so we can reject null hypothesis. Distributions of the median are not the same.
#calculating the effect size of Friedman ANOVA
effectsize::kendalls_w(Enjoyment ~ Genre | ID,
data = mydata_longR1)
## Warning: 162 block(s) contain ties, some containing only 1 unique ranking.
## Kendall's W | 95% CI
## --------------------------
## 0.22 | [0.18, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
The agreement of the medians in these four variables is only 0.22
#
interpret_kendalls_w(.22)
## [1] "fair agreement"
## (Rules: landis1977)
Kendall W effect size that there is only fair agreement of students on the music Genre they enjoy
#Wilcoxon signed rank tests - comparing all possible pairs to see which ones have differences. P values adjusted using Bonferroni. Multiplying p value by number of variables.
paires_nonparR1 <- wilcox_test(Enjoyment ~ Genre,
paired = TRUE,
p.adjust.method = "bonferroni",
data = mydata_longR1)
paires_nonparR1
## # A tibble: 6 Γ 9
## .y. group1 group2 n1 n2 statiβ¦ΒΉ p p.adj p.adjβ¦Β²
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Enjoyment Dance Folk 198 198 10590. 4.59e-11 2.75e-10 ****
## 2 Enjoyment Dance Hiphop..R⦠198 198 4986 5.5 e- 2 3.3 e- 1 ns
## 3 Enjoyment Dance Rock 198 198 3289 2.29e- 7 1.37e- 6 ****
## 4 Enjoyment Folk Hiphop..R⦠198 198 3438. 1.52e- 6 9.12e- 6 ****
## 5 Enjoyment Folk Rock 198 198 1180. 4.73e-21 2.84e-20 ****
## 6 Enjoyment Hiphop..Rap Rock 198 198 3158. 9.6 e- 9 5.76e- 8 ****
## # β¦ with abbreviated variable names ΒΉβstatistic, Β²βp.adj.signif
All of the groups of Genre are significantly different from one another apart from Hiphop..Rap and Dance.
library(rstatix)
comparisons <- paires_nonparR1 %>%
add_y_position(fun = "median", step.increase = 0.35)
library(ggpubr)
ggboxplot(mydata_longR1, x = "Genre", y = "Enjoyment", add = "point", ylim=c(0, 18)) +
stat_pvalue_manual(comparisons, hide.ns = FALSE) +
stat_summary(fun = median, geom = "point", shape = 16, size = 4,
aes(group = Genre), color = "darkred",
position = position_dodge(width = 0.8)) +
stat_summary(fun = median, colour = "darkred",
position = position_dodge(width = 0.8),
geom = "text", vjust = 0.5, hjust = -8,
aes(label = round(after_stat(y), digits = 2), group = Genre)) +
labs(subtitle = get_test_label(FriedmanANOVAR1, detailed = TRUE),
caption = get_pwc_label(comparisons))
We found that the distribution of Enjoyment levels differs for at least one of the music genres(π2 =98.98, π<0.0001), the agreement of respondents was βfairβ (π198 = 0.22). Post-Hoc tests revealed differences for each pair of groups (π<0.0001).
Research Question 2:
Are there differences in the stated saving habits of male and female students?
H0: There are no differences in the stated saving habits of male and female students
H1: Female students state they try to save more of their money than male students
Unit of Observation: Students
Variables:
-Gender Gender of student, Male or Female
-Finances How much a student agrees they try to save all money they can, from strongly agree 1-5 to strongly disagree
#Selected just the variables relating Saving Habits.
customdata2 <- importdata %>% dplyr::select(134, 145,
)
#Inserted ID column.
customdata3 <- tibble::rowid_to_column(customdata2, "ID")
#Changed missing data to NA and removed all NA's from the data.
nachange2 <- customdata3 %>%
na_if ('') %>%
drop_na()
head(nachange2)
## ID Finances Gender
## 1 1 3 female
## 2 2 3 female
## 3 3 2 female
## 4 4 2 female
## 5 5 4 female
## 6 6 2 male
#Created a random sample of the data which always returns the same random sample.
set.seed(1)
mydataR2 <- nachange2 %>% sample_frac(.2)
#Changed Gender variable to a factor.
mydataR2$Gender1 <- factor(mydataR2$Gender,
labels = c("Male", "Female"))
#Descriptive statistics of Finances split by Gender
psych::describeBy(mydataR2$Finances, g = mydataR2$Gender)
##
## Descriptive statistics by group
## group: female
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 121 3.07 1.21 3 3.09 1.48 1 5 4 -0.14 -0.84 0.11
## ------------------------------------------------------------
## group: male
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 79 2.75 1.1 3 2.75 1.48 1 5 4 -0.01 -0.93 0.12
Initial descriptive statistics show differences in the mean of both groups and there suggesting a difference between population arithmetic means. Further testing is needed, in this case we have independent samples and we will consider using a Paired t-test, but first we must check assumptions.
#Histogram of Finances by split Gender
ggplot(mydataR2, aes(x = Finances)) +
geom_histogram(binwidth = 1, colour="gray") +
facet_wrap(~ Gender, ncol = 1) +
ylab("Frequency")
#shapiro test to see if distributions are normal
mydataR2 %>%
group_by(Gender1) %>%
shapiro_test(Finances)
## # A tibble: 2 Γ 4
## Gender1 variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Male Finances 0.911 0.000000719
## 2 Female Finances 0.908 0.0000287
H0: The variable is not normally distributed
H1: The variable is normally distributed
All p values are below 0.05 so we cannot reject the null hypothesis for any variable, none of our variables are normally distributed. Therefore, we must use a non-parametric test for testing our hypothesis.
In this case we are testing the hypothesis about the difference between two population arithmetic means. The appropriate non parametric test is Wilcoxon Signed Rank Test.
wilcox.test(mydataR2$Finances ~ mydataR2$Gender1,
paired = FALSE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided") #comparing all options
##
## Wilcoxon rank sum test
##
## data: mydataR2$Finances by mydataR2$Gender1
## W = 5514.5, p-value = 0.05831
## alternative hypothesis: true location shift is not equal to 0
H0: distribution location shift is equal to 0
H1: distribution location shift is not equal to 0
p value is above 0.001 so we cannot reject the null hypothesis, the two distributions locations of Finances are statistically the same.
#checking effect size of differences
effectsize(wilcox.test(mydataR2$Finances ~ mydataR2$Gender1,
paired = FALSE,
correct = FALSE,
exact = FALSE,
alternative = "two.sided"))
## r (rank biserial) | 95% CI
## ---------------------------------
## 0.15 | [-0.01, 0.31]
interpret_rank_biserial(0.15)
## [1] "small"
## (Rules: funder2019)
Effect size of Gender group on Finances is small
Conclusion:
Based on the sample data, we find that men and women do not differ in their stated saving habits (π> 0.001) the effect size was insignificantly small (0.15).
Research Question 3:
Does drinking affect students enjoyment of Rock music?
H0: The amount a students drinks has no affect enjoyment of rock music
H1: Students who drink more students enjoy rock music more
Unit of Observation: Students
Variables:
-Alcohol How much alcohol a students consumes, Never, Social Drinker, Drink A Lot
-Pop How much a student enjoys Pop music, from Donβt enjoy at all 1-2-3-4-5 to Enjoy very much
#Selected just the variables relating to Drinking and Music Taste.
customdata4 <- importdata %>% dplyr::select(9, 75,
)
#Inserted ID column
customdata5 <- tibble::rowid_to_column(customdata4, "ID")
#Changed missing data to NA and removed all NA's from the data.
nachange3 <- customdata4 %>%
na_if ('') %>%
drop_na()
head(nachange3)
## Rock Alcohol
## 1 5 drink a lot
## 2 5 drink a lot
## 3 5 drink a lot
## 4 2 drink a lot
## 5 3 social drinker
## 6 5 never
#Created a random sample of the data which always returns the same random sample.
set.seed(1)
mydataR3 <- nachange3 %>% sample_frac(.2)
#Changed Alcohol categories to factors.
mydataR3$AlcoholF1 <- factor(mydataR3$Alcohol,
levels = c("never", "social drinker","drink a lot"),
labels = c("Never Drink","Social Drinker","Drink A lot"))
#Descriptive statistics of Rock and of Rock by level of alcohol consumption.
psych::describe(mydataR3$Rock)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 200 3.87 1.17 4 4.03 1.48 1 5 4 -0.86 -0.2 0.08
psych::describeBy(x = mydataR3$Rock, group = mydataR3$AlcoholF1)
##
## Descriptive statistics by group
## group: Never Drink
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 20 3.7 1.03 4 3.75 1.48 2 5 3 -0.24 -1.2 0.23
## ------------------------------------------------------------
## group: Social Drinker
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 135 3.86 1.19 4 4.01 1.48 1 5 4 -0.83 -0.3 0.1
## ------------------------------------------------------------
## group: Drink A lot
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 45 3.98 1.2 4 4.16 1.48 1 5 4 -1.13 0.33 0.18
Initial descriptive statistics reveal that the more a student drinks the higher the mean score of enjoyment of rock. To test whether this is significant, we will consider One-way ANOVA.
First we must check if assumptions are met.
#Checking for homogeneity of variance in Rock grouped by alcohol consumption.
car::leveneTest(mydataR3$Rock, group = mydataR3$AlcoholF1)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.2036 0.816
## 197
H:0 Variances of Rock is the same in all groups
H:1 Variances of Rock is not the same in all groups
P values are above .05 and therefore not statistically significant, therefore we can accept the null that the variance of Rock is the same in all groups of Alcohol.
#Checking for distributions.
mydataR3 %>%
group_by(AlcoholF1) %>%
shapiro_test(Rock)
## # A tibble: 3 Γ 4
## AlcoholF1 variable statistic p
## <fct> <chr> <dbl> <dbl>
## 1 Never Drink Rock 0.878 1.66e- 2
## 2 Social Drinker Rock 0.834 4.70e-11
## 3 Drink A lot Rock 0.790 1.43e- 6
H:0 Distribution of Rock enjoyment is normally distributed
H:1 Distribution of Rock enjoyment is not normally distributed
p value is lower than 0.05, so we cannot accept null, Rock enjoyment is not normally distributed.
The assumption of normality has been violated and therefore we cannot use One-way ANOVA, instead we must use a non parametric test, Kruskal-Wallis rank sum test.
#Testing hypothesis with Kruskal-Wallis rank sum test.
kruskal.test(Rock ~ AlcoholF1,
data = mydataR3)
##
## Kruskal-Wallis rank sum test
##
## data: Rock by AlcoholF1
## Kruskal-Wallis chi-squared = 1.5821, df = 2, p-value = 0.4534
H:0 Medians of Rock are the same in all three groups of alcohol consumption
H:1 At least 1 median of Rock is different in the three groups of alcohol consumption
p value is is higher than 0.05 than, therefore we accept null hypothesis, none of the medians of Rock enjoyment are different from the groups of alcohol consumption
#Checking effect size.
kruskal_effsize(Rock ~ AlcoholF1,
data = mydataR3)
## # A tibble: 1 Γ 5
## .y. n effsize method magnitude
## * <chr> <int> <dbl> <chr> <ord>
## 1 Rock 200 -0.00212 eta2[H] small
Effect Size of alcohol consumption on Rock enjoyment is too small to be significant
#Wilcoxon signed rank tests - comparing all possible pairs to see which ones have differences. P values adjusted using Bonferroni. Multiplying p value by number of variables.
groups_nonparR3 <- wilcox_test(Rock ~ AlcoholF1,
paired = FALSE,
p.adjust.method = "bonferroni",
data = mydataR3)
groups_nonparR3
## # A tibble: 3 Γ 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.β¦ΒΉ
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 Rock Never Drink Social Drinker 20 135 1188 0.368 1 ns
## 2 Rock Never Drink Drink A lot 20 45 363 0.197 0.591 ns
## 3 Rock Social Drinker Drink A lot 135 45 2846. 0.508 1 ns
## # β¦ with abbreviated variable name ΒΉβp.adj.signif
None of the groups of alcohol are significantly different in the affect on rock enjoyment
#Plotting the results of the Wilcoxon signed rank tests and Kruskal-Wallis rank sum test in a boxplot to visualize the medians of Alcohol Factor.
pwc <- mydataR3 %>%
wilcox_test(Rock ~ AlcoholF1,
paired = FALSE,
p.adjust.method = "bonferroni")
Kruskal_results <- kruskal_test(Rock ~ AlcoholF1,
data = mydataR3)
library(rstatix)
pwc <- pwc %>%
add_y_position(fun = "median", step.increase = 0.35)
library(ggpubr)
ggboxplot(mydataR3, x = "AlcoholF1", y = "Rock", add = "point", ylim=c(0, 12)) +
stat_pvalue_manual(pwc, hide.ns = FALSE) +
stat_summary(fun = median, geom = "point", shape = 16, size = 4,
aes(group = AlcoholF1), color = "darkred",
position = position_dodge(width = 0.8)) +
stat_summary(fun = median, colour = "darkred",
position = position_dodge(width = 0.8),
geom = "text", vjust = -0.5, hjust = -1,
aes(label = round(after_stat(y), digits = 2), group = AlcoholF1)) +
labs(subtitle = get_test_label(Kruskal_results, detailed = TRUE),
caption = get_pwc_label(pwc))
We found that the distribution of Rock enjoyment does not differ for any of the levels of alcohol consumption. (π2 =1.58, π< 0.05), the effect size was insignificantly small (π2 = -0.0021). Post-Hoc tests revealed no differences for each pair of groups (π> 0.01).