MMM WT 2023/24: Exercise 1

Author
Affiliation
Susanne Adler

Institute for Marketing, Ludwig-Maximilians-University Munich

Set-up

Load packages and data

# packages ####

library(dplyr)
library(ggplot2)

# read data ####

data <- openxlsx::read.xlsx("MMM_influencer_data.xlsx")

Descriptive analyses

Sample size

Get the number of rows to get the sample size

nrow(data) 
[1] 223

Age

data %>% 
  summarize(
    Mean = mean(Age, na.rm=TRUE),
    SD = sd(Age, na.rm=TRUE),
    n = n())
      Mean       SD   n
1 25.65022 7.404867 223
Rmisc::CI(data$Age) # confidence interval (95% by default)
   upper     mean    lower 
26.62743 25.65022 24.67302 
ggplot(data, # specify data frame
       aes(Age)) +  # specify aestethics, i.e., variables
  geom_histogram() + # plot a histogram
  theme_minimal() # use a minimalistic graph design
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Gender

table(data$Gender) # get a gender table

female   male  other 
   152     70      1 
table(data$Gender) %>% 
  prop.table() # table with proportions

     female        male       other 
0.681614350 0.313901345 0.004484305 
ggplot(data,
       aes(Gender)) +
         geom_bar() +
         theme_minimal()

Group info and tests

sample size per group

influencer group

table(data$InfluencerGroup)

    Emma Franklin 
     107      116 
table(data$InfluencerGroup) %>% 
  prop.table() # tablewith proportions

     Emma  Franklin 
0.4798206 0.5201794 
ggplot(data,
       aes(InfluencerGroup)) +
  geom_bar() +
  theme_minimal()

media group

table(data$MediaGroup)

Instagram   Youtube 
      111       112 
table(data$MediaGroup) %>% 
  prop.table() # tablewith proportions

Instagram   Youtube 
0.4977578 0.5022422 
ggplot(data,
       aes(MediaGroup)) +
  geom_bar() +
  theme_minimal()

influencer group x media group

table(data$group)

    Emma on Instagram       Emma on Youtube Franklin on Instagram 
                   53                    54                    58 
  Franklin on Youtube 
                   58 
ggplot(data,
       aes(group)) +
  geom_bar() +
  theme_minimal()

Age per group

Descriptives

data %>% 
  group_by(group) %>%
  summarize(
    Mean = mean(Age, na.rm=TRUE),
    SD = sd(Age, na.rm=TRUE),
    n = n())
# A tibble: 4 × 4
  group                  Mean    SD     n
  <chr>                 <dbl> <dbl> <int>
1 Emma on Instagram      24.5  3.97    53
2 Emma on Youtube        27.8  9.90    54
3 Franklin on Instagram  24.2  6.41    58
4 Franklin on Youtube    26.1  7.67    58
Rmisc::group.CI(Age~group,
                data)
                  group Age.upper Age.mean Age.lower
1     Emma on Instagram  25.58462 24.49057  23.39651
2       Emma on Youtube  30.53629 27.83333  25.13038
3 Franklin on Instagram  25.87468 24.18966  22.50463
4   Franklin on Youtube  28.15335 26.13793  24.12252

Possible plots:

## multiple histograms

ggplot(data,
       aes(Age)) +
  geom_histogram() +
  theme_minimal() +
  facet_wrap(~group) # display one subgraph per group
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## boxplots with points

ggplot(data,
       aes(group, Age)) + # assign group to x-axis and Age to y-axis
    geom_dotplot(binaxis = "y", stackdir = "center", # align points on y-axis and to center
                 alpha = 0.2, # points hould be transparent
                 dotsize = 0.3) + # points should be small
  geom_boxplot(alpha = 0) + # make a boxplot that is fulle transparent
  theme_minimal()
Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

ggplot(Rmisc::group.CI(Age~group,
                       data), 
       aes(group, Age.mean)) +
  geom_dotplot(data = data, 
               aes(group, Age),
               binaxis = "y", 
               stackdir = "center", alpha = 0.3, dotsize = 0.5) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = Age.lower, ymax = Age.upper), size = 1) +
  theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Bin width defaults to 1/30 of the range of the data. Pick better value with
`binwidth`.

Test

This test performs a very basic (!) analysis of variance (ANOVA) witha Levene Test before and a Tukey post hoc test

car::leveneTest(Age~group, data)
Warning in leveneTest.default(y = y, group = group, ...): group coerced to
factor.
Levene's Test for Homogeneity of Variance (center = median)
       Df F value  Pr(>F)  
group   3  2.4303 0.06613 .
      219                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
aov_Age <- aov(Age~group, data)

summary(aov_Age)
             Df Sum Sq Mean Sq F value Pr(>F)  
group         3    466  155.39   2.907 0.0356 *
Residuals   219  11707   53.45                 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov_Age)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = Age ~ group, data = data)

$group
                                                diff       lwr         upr
Emma on Youtube-Emma on Instagram          3.3427673 -0.317047  7.00258158
Franklin on Instagram-Emma on Instagram   -0.3009109 -3.897671  3.29584973
Franklin on Youtube-Emma on Instagram      1.6473650 -1.949396  5.24412559
Franklin on Instagram-Emma on Youtube     -3.6436782 -7.222995 -0.06436161
Franklin on Youtube-Emma on Youtube       -1.6954023 -5.274719  1.88391425
Franklin on Youtube-Franklin on Instagram  1.9482759 -1.566543  5.46309494
                                              p adj
Emma on Youtube-Emma on Instagram         0.0870775
Franklin on Instagram-Emma on Instagram   0.9964028
Franklin on Youtube-Emma on Instagram     0.6365083
Franklin on Instagram-Emma on Youtube     0.0442699
Franklin on Youtube-Emma on Youtube       0.6108474
Franklin on Youtube-Franklin on Instagram 0.4789081

Gender per group

Descriptives and plot

table(data$group, data$Gender)
                       
                        female male other
  Emma on Instagram         40   13     0
  Emma on Youtube           31   23     0
  Franklin on Instagram     38   20     0
  Franklin on Youtube       43   14     1
table(data$group, data$Gender) %>% prop.table(., 1) 
                       
                            female       male      other
  Emma on Instagram     0.75471698 0.24528302 0.00000000
  Emma on Youtube       0.57407407 0.42592593 0.00000000
  Franklin on Instagram 0.65517241 0.34482759 0.00000000
  Franklin on Youtube   0.74137931 0.24137931 0.01724138
ggplot(data, aes(group, fill = Gender)) + geom_bar(position = "fill") + theme_minimal()

Test

Performs a chi square test

Since chi square tests do not perform well of many cells = 0, we omit the category “other”

table(data[data$Gender != "other", "group"],
      data[data$Gender != "other", "Gender"]) %>% 
  chisq.test()

    Pearson's Chi-squared test

data:  .
X-squared = 5.7809, df = 3, p-value = 0.1228