Homework 2 Students Attitudes

importdata <-  read.table ("./responses.csv", header=TRUE, sep=",", dec=".")

#Selected just the variables relating to music genre.
customdata <- importdata %>% dplyr::select(3, 4, 9, 12
)

#Inserted ID column
customdata1 <- tibble::rowid_to_column(customdata, "ID")


#Changed missing data to NA and removed all NA's from the data.
nachange <- customdata1 %>% 
              na_if ('') %>% 
              drop_na()

head(nachange)

##   ID Dance Folk Rock Hiphop..Rap
## 1  1     2    1    5           1
## 2  2     2    1    5           1
## 3  3     2    2    5           1
## 4  4     2    1    2           2
## 5  5     4    3    3           5
## 6  6     2    3    5           4

Research Question 1:

Are there differences in enjoyment of music Genres among students?

H0: Students enjoy all music Genres equally

H1: Students enjoy the Rock music Genre the most

Unit of Observation: Students

Variables:

-Dance How much a student enjoys Dance music, from Don’t enjoy at all 1-2-3-4-5 to Enjoy very much

-Folk How much a student enjoys Folk music, from Don’t enjoy at all 1-2-3-4-5 to Enjoy very much

-Punk How much a student enjoys Punk music, from Don’t enjoy at all 1-2-3-4-5 to Enjoy very much

-Rock How much a student enjoys Rock music, from Don’t enjoy at all 1-2-3-4-5 to Enjoy very much

#Created a random sample of the data which always returns the same random sample.
set.seed(1)
mydataR1 <- nachange %>% sample_frac(.2)

#Changed data from wide format to long, gathered music Genres under new variable "Genre". Changed this new variable to a factor.
mydata_longR1 <- mydataR1 %>%
                 gather(key = "Genre", 
                        value = "Enjoyment", 
                        Dance,  Folk,   Hiphop..Rap,    Rock,) %>%
                 convert_as_factor(Genre)

head(mydata_longR1)

##    ID Genre Enjoyment
## 1 850 Dance         1
## 2 691 Dance         4
## 3 132 Dance         3
## 4 947 Dance         2
## 5 521 Dance         3
## 6 482 Dance         4

#Descriptive statistics of Enjoyment by Genre of music.
mydata_longR1 %>%
  group_by(Genre) %>%
  get_summary_stats(Enjoyment, type = "common")

## # A tibble: 4 × 11
##   Genre       variable      n   min   max median   iqr  mean    sd    se    ci
##   <fct>       <fct>     <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Dance       Enjoyment   198     1     5      3     2  3.08  1.13 0.08  0.158
## 2 Folk        Enjoyment   198     1     5      2     2  2.22  1.13 0.08  0.158
## 3 Hiphop..Rap Enjoyment   198     1     5      3     2  2.88  1.36 0.096 0.19 
## 4 Rock        Enjoyment   198     1     5      4     2  3.76  1.20 0.085 0.168

Initial descriptive statistics reveal that the differences exist in the mean enjoyment of each Genre. To test whether this is significant, we will consider One-way repeated measures ANOVA.

First we must check if assumptions are met.

#created a boxplot the spread of each Genre. Units shown as individual plots to visualize outliers
ggboxplot(mydata_longR1,  
          x = "Genre", 
          y = "Enjoyment",
          add = "jitter"
)

#Checking if variables are normally distributed with Shapiro-Wilk test
mydata_longR1 %>%
  group_by(Genre) %>%
  shapiro_test(Enjoyment)

## # A tibble: 4 × 4
##   Genre       variable  statistic        p
##   <fct>       <chr>         <dbl>    <dbl>
## 1 Dance       Enjoyment     0.916 3.66e- 9
## 2 Folk        Enjoyment     0.859 1.44e-12
## 3 Hiphop..Rap Enjoyment     0.897 1.76e-10
## 4 Rock        Enjoyment     0.855 9.02e-13

H0: The variable is not normally distributed

H1: The variable is normally distributed

All p values are below 0.05 so we cannot reject the null hypothesis for any variable, none of our variables are normally distributed. Therefore, we must use a non-parametric test for testing our hypothesis.

In this case we are testing the hypothesis about the equality of three or more population arithmetic means for dependent samples. We have ordinal variables. Therefore, the appropriate non-parametric test is Friedman Anova.

#Computing Friedman ANOVA
FriedmanANOVAR1 <- friedman_test(Enjoyment ~ Genre | ID,
                               data = mydata_longR1)

FriedmanANOVAR1

## # A tibble: 1 × 6
##   .y.           n statistic    df        p method       
## * <chr>     <int>     <dbl> <dbl>    <dbl> <chr>        
## 1 Enjoyment   198      130.     3 5.71e-28 Friedman test

H:0 Median of Dance = Folk = Pop = Rock

H:1 At least one Median of the music Genres is not equal to the others

p value is below 0.0001 so we can reject null hypothesis. Distributions of the median are not the same.

#calculating the effect size of Friedman ANOVA
effectsize::kendalls_w(Enjoyment ~ Genre | ID,
                       data = mydata_longR1)

## Warning: 162 block(s) contain ties, some containing only 1 unique ranking.

## Kendall's W |       95% CI
## --------------------------
## 0.22        | [0.18, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].

The agreement of the medians in these four variables is only 0.22

#
interpret_kendalls_w(.22)

## [1] "fair agreement"
## (Rules: landis1977)

Kendall W effect size that there is only fair agreement of students on the music Genre they enjoy

#Wilcoxon signed rank tests - comparing all possible pairs to see which ones have differences. P values adjusted using Bonferroni. Multiplying p value by number of variables. 
paires_nonparR1 <- wilcox_test(Enjoyment ~ Genre, 
                             paired = TRUE, 
                             p.adjust.method = "bonferroni",
                             data = mydata_longR1)

paires_nonparR1

## # A tibble: 6 × 9
##   .y.       group1      group2        n1    n2 stati…¹        p    p.adj p.adj…²
## * <chr>     <chr>       <chr>      <int> <int>   <dbl>    <dbl>    <dbl> <chr>  
## 1 Enjoyment Dance       Folk         198   198  10590. 4.59e-11 2.75e-10 ****   
## 2 Enjoyment Dance       Hiphop..R…   198   198   4986  5.5 e- 2 3.3 e- 1 ns     
## 3 Enjoyment Dance       Rock         198   198   3289  2.29e- 7 1.37e- 6 ****   
## 4 Enjoyment Folk        Hiphop..R…   198   198   3438. 1.52e- 6 9.12e- 6 ****   
## 5 Enjoyment Folk        Rock         198   198   1180. 4.73e-21 2.84e-20 ****   
## 6 Enjoyment Hiphop..Rap Rock         198   198   3158. 9.6 e- 9 5.76e- 8 ****   
## # … with abbreviated variable names ¹statistic, ²p.adj.signif

All of the groups of Genre are significantly different from one another apart from Hiphop..Rap and Dance.

library(rstatix)
comparisons <- paires_nonparR1 %>% 
                 add_y_position(fun = "median", step.increase = 0.35)

library(ggpubr)
ggboxplot(mydata_longR1, x = "Genre", y = "Enjoyment", add = "point", ylim=c(0, 18)) +
  stat_pvalue_manual(comparisons, hide.ns = FALSE) +
  stat_summary(fun = median, geom = "point", shape = 16, size = 4,
               aes(group = Genre), color = "darkred",
               position = position_dodge(width = 0.8)) +
  stat_summary(fun = median, colour = "darkred", 
               position = position_dodge(width = 0.8),
               geom = "text", vjust = 0.5, hjust = -8,
               aes(label = round(after_stat(y), digits = 2), group = Genre)) +
  labs(subtitle = get_test_label(FriedmanANOVAR1,  detailed = TRUE),
       caption = get_pwc_label(comparisons))

We found that the distribution of Enjoyment levels differs for at least one of the music genres(𝜒2 =98.98, 𝑝<0.0001), the agreement of respondents was “fair” (𝜂198 = 0.22). Post-Hoc tests revealed differences for each pair of groups (𝑝<0.0001).

Research Question 2:

Are there differences in the stated saving habits of male and female students?

H0: There are no differences in the stated saving habits of male and female students

H1: Female students state they try to save more of their money than male students

Unit of Observation: Students

Variables:

-Gender Gender of student, Male or Female

-Finances How much a student agrees they try to save all money they can, from strongly agree 1-5 to strongly disagree

#Selected just the variables relating Saving Habits.
customdata2 <- importdata %>% dplyr::select(134, 145,
)

#Inserted ID column.
customdata3 <- tibble::rowid_to_column(customdata2, "ID")


#Changed missing data to NA and removed all NA's from the data.
nachange2 <- customdata3 %>% 
              na_if ('') %>% 
              drop_na()

head(nachange2)

##   ID Finances Gender
## 1  1        3 female
## 2  2        3 female
## 3  3        2 female
## 4  4        2 female
## 5  5        4 female
## 6  6        2   male

#Created a random sample of the data which always returns the same random sample.
set.seed(1)
mydataR2 <- nachange2 %>% sample_frac(.2)

#Changed Gender variable to a factor.
mydataR2$Gender1 <- factor(mydataR2$Gender, 
                        
                         labels = c("Male", "Female"))

#Descriptive statistics of Finances split by Gender
psych::describeBy(mydataR2$Finances, g = mydataR2$Gender)

## 
##  Descriptive statistics by group 
## group: female
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 121 3.07 1.21      3    3.09 1.48   1   5     4 -0.14    -0.84 0.11
## ------------------------------------------------------------ 
## group: male
##    vars  n mean  sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 79 2.75 1.1      3    2.75 1.48   1   5     4 -0.01    -0.93 0.12

Initial descriptive statistics show differences in the mean of both groups and there suggesting a difference between population arithmetic means. Further testing is needed, in this case we have independent samples and we will consider using a Paired t-test, but first we must check assumptions.

#Histogram of Finances by split Gender
ggplot(mydataR2, aes(x = Finances)) +
  geom_histogram(binwidth = 1, colour="gray") +
  facet_wrap(~ Gender, ncol = 1) + 
  ylab("Frequency")

#shapiro test to see if distributions are normal
mydataR2 %>%
  group_by(Gender1) %>%
  shapiro_test(Finances)

## # A tibble: 2 × 4
##   Gender1 variable statistic           p
##   <fct>   <chr>        <dbl>       <dbl>
## 1 Male    Finances     0.911 0.000000719
## 2 Female  Finances     0.908 0.0000287

H0: The variable is not normally distributed

H1: The variable is normally distributed

In this case we are testing the hypothesis about the difference between two population arithmetic means. The appropriate non parametric test is Wilcoxon Signed Rank Test.

wilcox.test(mydataR2$Finances ~ mydataR2$Gender1,
            paired = FALSE,
            correct = FALSE,
            exact = FALSE,
            alternative = "two.sided") #comparing all options

## 
##  Wilcoxon rank sum test
## 
## data:  mydataR2$Finances by mydataR2$Gender1
## W = 5514.5, p-value = 0.05831
## alternative hypothesis: true location shift is not equal to 0

H0: distribution location shift is equal to 0

H1: distribution location shift is not equal to 0

p value is above 0.001 so we cannot reject the null hypothesis, the two distributions locations of Finances are statistically the same.

#checking effect size of differences
effectsize(wilcox.test(mydataR2$Finances ~ mydataR2$Gender1,
                       paired = FALSE,
                       correct = FALSE,
                       exact = FALSE,
                       alternative = "two.sided"))

## r (rank biserial) |        95% CI
## ---------------------------------
## 0.15              | [-0.01, 0.31]

interpret_rank_biserial(0.15)

## [1] "small"
## (Rules: funder2019)

Effect size of Gender group on Finances is small

Conclusion:

Based on the sample data, we find that men and women do not differ in their stated saving habits (𝑝> 0.001) the effect size was insignificantly small (0.15).

Research Question 3:

Does drinking affect students enjoyment of Rock music?

H0: The amount a students drinks has no affect enjoyment of rock music

H1: Students who drink more students enjoy rock music more

Unit of Observation: Students

Variables:

-Alcohol How much alcohol a students consumes, Never, Social Drinker, Drink A Lot

-Pop How much a student enjoys Pop music, from Don’t enjoy at all 1-2-3-4-5 to Enjoy very much

#Selected just the variables relating to Drinking and Music Taste.
customdata4 <- importdata %>% dplyr::select(9, 75,
)

#Inserted ID column
customdata5 <- tibble::rowid_to_column(customdata4, "ID")


#Changed missing data to NA and removed all NA's from the data.
nachange3 <- customdata4 %>% 
              na_if ('') %>% 
              drop_na()

head(nachange3)

##   Rock        Alcohol
## 1    5    drink a lot
## 2    5    drink a lot
## 3    5    drink a lot
## 4    2    drink a lot
## 5    3 social drinker
## 6    5          never

#Created a random sample of the data which always returns the same random sample.
set.seed(1)
mydataR3 <- nachange3 %>% sample_frac(.2)

#Changed Alcohol categories to factors.
mydataR3$AlcoholF1 <- factor(mydataR3$Alcohol, 
                            levels = c("never", "social drinker","drink a lot"),  
                            labels = c("Never Drink","Social Drinker","Drink A lot"))

#Descriptive statistics of Rock and of Rock by level of alcohol consumption.
psych::describe(mydataR3$Rock)

##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 200 3.87 1.17      4    4.03 1.48   1   5     4 -0.86     -0.2 0.08

psych::describeBy(x = mydataR3$Rock, group = mydataR3$AlcoholF1)

## 
##  Descriptive statistics by group 
## group: Never Drink
##    vars  n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 20  3.7 1.03      4    3.75 1.48   2   5     3 -0.24     -1.2 0.23
## ------------------------------------------------------------ 
## group: Social Drinker
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis  se
## X1    1 135 3.86 1.19      4    4.01 1.48   1   5     4 -0.83     -0.3 0.1
## ------------------------------------------------------------ 
## group: Drink A lot
##    vars  n mean  sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 45 3.98 1.2      4    4.16 1.48   1   5     4 -1.13     0.33 0.18

Initial descriptive statistics reveal that the more a student drinks the higher the mean score of enjoyment of rock. To test whether this is significant, we will consider One-way ANOVA.

First we must check if assumptions are met.

#Checking for homogeneity of variance in Rock grouped by alcohol consumption.
car::leveneTest(mydataR3$Rock, group = mydataR3$AlcoholF1)

## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   2  0.2036  0.816
##       197

H:0 Variances of Rock is the same in all groups

H:1 Variances of Rock is not the same in all groups

P values are above .05 and therefore not statistically significant, therefore we can accept the null that the variance of Rock is the same in all groups of Alcohol.

#Checking for distributions.
mydataR3 %>%
  group_by(AlcoholF1) %>%
  shapiro_test(Rock)

## # A tibble: 3 × 4
##   AlcoholF1      variable statistic        p
##   <fct>          <chr>        <dbl>    <dbl>
## 1 Never Drink    Rock         0.878 1.66e- 2
## 2 Social Drinker Rock         0.834 4.70e-11
## 3 Drink A lot    Rock         0.790 1.43e- 6

H:0 Distribution of Rock enjoyment is normally distributed

H:1 Distribution of Rock enjoyment is not normally distributed

p value is lower than 0.05, so we cannot accept null, Rock enjoyment is not normally distributed.

The assumption of normality has been violated and therefore we cannot use One-way ANOVA, instead we must use a non parametric test, Kruskal-Wallis rank sum test.

#Testing hypothesis with Kruskal-Wallis rank sum test.
kruskal.test(Rock ~ AlcoholF1, 
             data = mydataR3)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  Rock by AlcoholF1
## Kruskal-Wallis chi-squared = 1.5821, df = 2, p-value = 0.4534

H:0 Medians of Rock are the same in all three groups of alcohol consumption

H:1 At least 1 median of Rock is different in the three groups of alcohol consumption

p value is is higher than 0.05 than, therefore we accept null hypothesis, none of the medians of Rock enjoyment are different from the groups of alcohol consumption

#Checking effect size.
kruskal_effsize(Rock ~ AlcoholF1, 
                data = mydataR3)

## # A tibble: 1 × 5
##   .y.       n  effsize method  magnitude
## * <chr> <int>    <dbl> <chr>   <ord>    
## 1 Rock    200 -0.00212 eta2[H] small

Effect Size of alcohol consumption on Rock enjoyment is too small to be significant

#Wilcoxon signed rank tests - comparing all possible pairs to see which ones have differences. P values adjusted using Bonferroni. Multiplying p value by number of variables. 
groups_nonparR3 <- wilcox_test(Rock ~ AlcoholF1,
                             paired = FALSE,
                             p.adjust.method = "bonferroni",
                             data = mydataR3)

groups_nonparR3

## # A tibble: 3 × 9
##   .y.   group1         group2            n1    n2 statistic     p p.adj p.adj.…¹
## * <chr> <chr>          <chr>          <int> <int>     <dbl> <dbl> <dbl> <chr>   
## 1 Rock  Never Drink    Social Drinker    20   135     1188  0.368 1     ns      
## 2 Rock  Never Drink    Drink A lot       20    45      363  0.197 0.591 ns      
## 3 Rock  Social Drinker Drink A lot      135    45     2846. 0.508 1     ns      
## # … with abbreviated variable name ¹p.adj.signif

None of the groups of alcohol are significantly different in the affect on rock enjoyment

#Plotting the results of the Wilcoxon signed rank tests and Kruskal-Wallis rank sum test in a boxplot to visualize the medians of Alcohol Factor. 
pwc <- mydataR3 %>%
  wilcox_test(Rock ~ AlcoholF1, 
              paired = FALSE, 
              p.adjust.method = "bonferroni")
 
Kruskal_results <- kruskal_test(Rock ~ AlcoholF1, 
                                data = mydataR3)

library(rstatix)
pwc <- pwc %>% 
       add_y_position(fun = "median", step.increase = 0.35)

library(ggpubr)
ggboxplot(mydataR3, x = "AlcoholF1", y = "Rock", add = "point", ylim=c(0, 12)) +
  stat_pvalue_manual(pwc, hide.ns = FALSE) +
  stat_summary(fun = median, geom = "point", shape = 16, size = 4,
               aes(group = AlcoholF1), color = "darkred",
               position = position_dodge(width = 0.8)) +
  stat_summary(fun = median, colour = "darkred", 
               position = position_dodge(width = 0.8),
               geom = "text", vjust = -0.5, hjust = -1,
               aes(label = round(after_stat(y), digits = 2), group = AlcoholF1)) +
  labs(subtitle = get_test_label(Kruskal_results,  detailed = TRUE),
       caption = get_pwc_label(pwc))

We found that the distribution of Rock enjoyment does not differ for any of the levels of alcohol consumption. (𝜒2 =1.58, 𝑝< 0.05), the effect size was insignificantly small (𝜂2 = -0.0021). Post-Hoc tests revealed no differences for each pair of groups (𝑝> 0.01).

Homework 2 Students Attitudes

Paul Shenstone

2023-01-12