# packages ####
library(dplyr)
library(ggplot2)
# read data ####
# Load the influencer survey from the Excel workbook into `data`
data <- openxlsx::read.xlsx(xlsxFile = "MMM_influencer_data.xlsx")
MMM WT 2023/24: Exercise 1
Set-up
Load packages and data
Descriptive analyses
Sample size
Get the number of rows to get the sample size
dim(data)[1L] # first dimension = number of rows = sample size
[1] 223
Age
# Overall mean, standard deviation and row count for Age
summarise(
  data,
  Mean = mean(Age, na.rm = TRUE),
  SD = sd(Age, na.rm = TRUE),
  n = n()
)
Mean SD n
1 25.65022 7.404867 223
# Confidence interval of the mean age (95% by default).
# Rmisc::CI() does not drop missing values itself, so remove them first to
# stay consistent with the na.rm = TRUE used for the mean and SD.
Rmisc::CI(na.omit(data$Age))
upper mean lower
26.62743 25.65022 24.67302
# Histogram of the age distribution
ggplot(data = data, mapping = aes(x = Age)) + # data frame and aesthetics (variables)
  geom_histogram() + # draw a histogram
  theme_minimal() # minimalistic graph design
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Gender
table(data[["Gender"]]) # absolute frequencies per gender
female male other
152 70 1
prop.table(table(data[["Gender"]])) # relative frequencies per gender
female male other
0.681614350 0.313901345 0.004484305
# Bar chart of the gender counts
ggplot(data = data, mapping = aes(x = Gender)) +
  geom_bar() +
  theme_minimal()
Group info and tests
sample size per group
influencer group
table(data[["InfluencerGroup"]]) # participants per influencer
Emma Franklin
107 116
prop.table(table(data[["InfluencerGroup"]])) # table with proportions
Emma Franklin
0.4798206 0.5201794
# Bar chart of the influencer group sizes
ggplot(data = data, mapping = aes(x = InfluencerGroup)) +
  geom_bar() +
  theme_minimal()
media group
table(data[["MediaGroup"]]) # participants per media platform
Instagram Youtube
111 112
prop.table(table(data[["MediaGroup"]])) # table with proportions
Instagram Youtube
0.4977578 0.5022422
# Bar chart of the media group sizes
ggplot(data = data, mapping = aes(x = MediaGroup)) +
  geom_bar() +
  theme_minimal()
influencer group x media group
table(data[["group"]]) # participants per influencer x media combination
Emma on Instagram Emma on Youtube Franklin on Instagram
53 54 58
Franklin on Youtube
58
# Bar chart of the sizes of the combined groups
ggplot(data = data, mapping = aes(x = group)) +
  geom_bar() +
  theme_minimal()
Age per group
Descriptives
# Age descriptives (mean, SD, group size) for each combined group
summarise(
  group_by(data, group),
  Mean = mean(Age, na.rm = TRUE),
  SD = sd(Age, na.rm = TRUE),
  n = n()
)
# A tibble: 4 × 4
group Mean SD n
<chr> <dbl> <dbl> <int>
1 Emma on Instagram 24.5 3.97 53
2 Emma on Youtube 27.8 9.90 54
3 Franklin on Instagram 24.2 6.41 58
4 Franklin on Youtube 26.1 7.67 58
# Confidence interval of the mean age within each group
Rmisc::group.CI(x = Age ~ group, data = data)
group Age.upper Age.mean Age.lower
1 Emma on Instagram 25.58462 24.49057 23.39651
2 Emma on Youtube 30.53629 27.83333 25.13038
3 Franklin on Instagram 25.87468 24.18966 22.50463
4 Franklin on Youtube 28.15335 26.13793 24.12252
Possible plots:
## multiple histograms
ggplot(data = data, mapping = aes(x = Age)) +
  geom_histogram() +
  theme_minimal() +
  facet_wrap(facets = ~group) # one sub-plot per group
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## boxplots with points
ggplot(data = data, mapping = aes(x = group, y = Age)) + # group on x-axis, Age on y-axis
  geom_dotplot(
    binaxis = "y", # bin the points along the y-axis
    stackdir = "center", # stack them around the centre of each group
    alpha = 0.2, # points should be transparent
    dotsize = 0.3 # points should be small
  ) +
  geom_boxplot(alpha = 0) + # boxplot that is fully transparent
  theme_minimal()
Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.
# Group means with their confidence intervals, drawn on top of the raw ages
ggplot(
  Rmisc::group.CI(Age ~ group, data),
  aes(group, Age.mean)
) +
  geom_dotplot(
    data = data, # raw observations instead of the per-group summary
    aes(group, Age),
    binaxis = "y",
    stackdir = "center", alpha = 0.3, dotsize = 0.5
  ) +
  geom_point(size = 3) +
  # `linewidth` replaces `size` for lines since ggplot2 3.4.0
  geom_errorbar(aes(ymin = Age.lower, ymax = Age.upper), linewidth = 1) +
  theme_minimal()
Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.
Test
This section performs a very basic (!) analysis of variance (ANOVA), preceded by a Levene test and followed by a Tukey post hoc test
# Levene's test for homogeneity of variance of Age across the groups.
# `group` is a character column; converting it explicitly avoids the
# "group coerced to factor" warning.
car::leveneTest(Age ~ factor(group), data)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 3 2.4303 0.06613 .
219
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-way ANOVA of Age on group, followed by its ANOVA table
aov_Age <- aov(formula = Age ~ group, data = data)
summary(aov_Age)
Df Sum Sq Mean Sq F value Pr(>F)
group 3 466 155.39 2.907 0.0356 *
Residuals 219 11707 53.45
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey post hoc comparisons of all group pairs (95% family-wise level)
TukeyHSD(x = aov_Age, conf.level = 0.95)
Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = Age ~ group, data = data)
$group
diff lwr upr
Emma on Youtube-Emma on Instagram 3.3427673 -0.317047 7.00258158
Franklin on Instagram-Emma on Instagram -0.3009109 -3.897671 3.29584973
Franklin on Youtube-Emma on Instagram 1.6473650 -1.949396 5.24412559
Franklin on Instagram-Emma on Youtube -3.6436782 -7.222995 -0.06436161
Franklin on Youtube-Emma on Youtube -1.6954023 -5.274719 1.88391425
Franklin on Youtube-Franklin on Instagram 1.9482759 -1.566543 5.46309494
p adj
Emma on Youtube-Emma on Instagram 0.0870775
Franklin on Instagram-Emma on Instagram 0.9964028
Franklin on Youtube-Emma on Instagram 0.6365083
Franklin on Instagram-Emma on Youtube 0.0442699
Franklin on Youtube-Emma on Youtube 0.6108474
Franklin on Youtube-Franklin on Instagram 0.4789081
Gender per group
Descriptives and plot
table(data[["group"]], data[["Gender"]]) # group x gender counts
female male other
Emma on Instagram 40 13 0
Emma on Youtube 31 23 0
Franklin on Instagram 38 20 0
Franklin on Youtube 43 14 1
# Gender proportions within each group (rows sum to 1)
prop.table(table(data[["group"]], data[["Gender"]]), margin = 1)
female male other
Emma on Instagram 0.75471698 0.24528302 0.00000000
Emma on Youtube 0.57407407 0.42592593 0.00000000
Franklin on Instagram 0.65517241 0.34482759 0.00000000
Franklin on Youtube 0.74137931 0.24137931 0.01724138
# Stacked bars scaled to 100% to show the gender mix per group
ggplot(data = data, mapping = aes(x = group, fill = Gender)) +
  geom_bar(position = "fill") +
  theme_minimal()
Test
Performs a chi square test
Since chi square tests do not perform well if many cells are 0, we omit the category “other”
# Chi-squared test of group x gender, leaving out the "other" category
data %>%
  filter(Gender != "other") %>%
  select(group, Gender) %>%
  table() %>%
  chisq.test()
Pearson's Chi-squared test
data: .
X-squared = 5.7809, df = 3, p-value = 0.1228