ANOVA: сравнение средних в нескольких группах

Олеся Волченко

28 октября 2020

Что мы уже знаем

Бинарная Номинальная/ординальная Интервальная/отношений
Бинарная Тест хи-квадрат с поправкой Йетса Тест хи-квадрат t-test
Номинальная/ординальная Тест хи-квадрат Тест хи-квадрат ?
Интервальная/отношений t-test ? ?

ANOVA

Общая логика

Формулы

\[ SST = \sum_{i=1}^k \sum_{i=1}^n (x_{ij} - \bar{x})^2 \]

\[ SSW = \sum_{i=1}^k \sum_{i=1}^n (x_{ij} - \bar{x}_i)^2\]

\[ SSB = \sum_{i=1}^k \sum_{j=1}^n (\bar{x}_{i} - \bar{x})^2 \]

\[ F_{stat} = \frac{\frac{SSB}{m-1}}{\frac{SSW}{m(n-1)}}\]

https://www.khanacademy.org/math/statistics-probability/analysis-of-variance-anova-library/analysis-of-variance-anova/v/anova-1-calculating-sst-total-sum-of-squares

Ценности из ESS

library(foreign)
ESS5 <- read.spss("/Users/olesyavolchenko/Yandex.Disk.localized/datafiles/ESS/ESS5e03.sav", use.value.labels=T, to.data.frame=T)

imprich - Important to be rich, have money and expensive things

iprspot - Important to get respect from others

ESS5$Power <- (as.numeric(ESS5$imprich) + as.numeric(ESS5$iprspot))/2
hist(ESS5$Power)

Проверим, связана ли ценность власти и частота использования Интернета

library(knitr)
knitr::kable(data.frame(table(ESS5$netuse)))
Var1 Freq
No access at home or work 13145
Never use 6987
Less than once a month 655
Once a month 542
Several times a month 1341
Once a week 1731
Several times a week 6164
Every day 21771

Проверим, связана ли ценность власти и частота использования Интернета

par(mfrow = c(1, 1), mar = c(12, 4, 4, 1))
boxplot(ESS5$Power ~ ESS5$netuse, las = 2)

Проверим, связана ли ценность власти и частота использования Интернета

Проверяем, равны ли дисперсии

Нулевая гипотеза для теста Левина - дисперсии не различаются по группам

library(car)
## Loading required package: carData
leveneTest(ESS5$Power ~ ESS5$netuse)
## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value  Pr(>F)  
## group     7  1.7338 0.09619 .
##       50671                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Делаем анову

oneway.test(ESS5$Power ~ ESS5$netuse, var.equal = T) 
## 
##  One-way analysis of means
## 
## data:  ESS5$Power and ESS5$netuse
## F = 8.6271, num df = 7, denom df = 50671, p-value = 1.281e-10

p-value < 0.05, следовательно, можем отвергнуть нулевую гипотезу.

Как описывать результаты?

oneway.test(ESS5$Power ~ ESS5$netuse, var.equal = T) 
## 
##  One-way analysis of means
## 
## data:  ESS5$Power and ESS5$netuse
## F = 8.6271, num df = 7, denom df = 50671, p-value = 1.281e-10

F(7, 50671) = 8.6271, p = 1.281e-10

Размер эффекта

library(sjstats)
## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car
omega_sq(aov(ESS5$Power ~ ESS5$netuse))
##          term omegasq
## 1 ESS5$netuse   0.001
eta_sq(aov(ESS5$Power ~ ESS5$netuse))
##          term etasq
## 1 ESS5$netuse 0.001

0.01 - 0.06 - Small
0.06 - 0.14 - Medium
> 0.14 - Large

Post hoc: Tukey test

Если дисперсии равны, то можно воспользоваться другой функцией для anova:

aov.out <- aov(ESS5$Power ~ ESS5$netuse) #another way to run ANOVA
summary(aov.out)
##                Df Sum Sq Mean Sq F value   Pr(>F)    
## ESS5$netuse     7     71  10.201   8.627 1.28e-10 ***
## Residuals   50671  59914   1.182                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1779 observations deleted due to missingness

На основе её выдачи рассчитать Tukey HSD (он подходит в ситуации, если дисперсии равны)

Tukey <- TukeyHSD(aov.out)
Tukey
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = ESS5$Power ~ ESS5$netuse)
## 
## $`ESS5$netuse`
##                                                          diff         lwr
## Never use-No access at home or work               0.034438857 -0.01533384
## Less than once a month-No access at home or work  0.007009052 -0.12755432
## Once a month-No access at home or work            0.075421751 -0.07139620
## Several times a month-No access at home or work  -0.050678300 -0.14650701
## Once a week-No access at home or work             0.059374870 -0.02629512
## Several times a week-No access at home or work   -0.010746411 -0.06242951
## Every day-No access at home or work              -0.056887776 -0.09400394
## Less than once a month-Never use                 -0.027429805 -0.16473286
## Once a month-Never use                            0.040982893 -0.10835009
## Several times a month-Never use                  -0.085117158 -0.18475635
## Once a week-Never use                             0.024936012 -0.06497603
## Several times a week-Never use                   -0.045185269 -0.10363157
## Every day-Never use                              -0.091326634 -0.13739655
## Once a month-Less than once a month               0.068412698 -0.12634438
## Several times a month-Less than once a month     -0.057687353 -0.21755589
## Once a week-Less than once a month                0.052365817 -0.10162820
## Several times a week-Less than once a month      -0.017755464 -0.15576253
## Every day-Less than once a month                 -0.063896829 -0.19713503
## Several times a month-Once a month               -0.126100051 -0.29641200
## Once a week-Once a month                         -0.016046881 -0.18085697
## Several times a week-Once a month                -0.086168162 -0.23614870
## Every day-Once a month                           -0.132309527 -0.27791388
## Once a week-Several times a month                 0.110053170 -0.01156096
## Several times a week-Several times a month        0.039931889 -0.06067522
## Every day-Several times a month                  -0.006209476 -0.10016828
## Several times a week-Once a week                 -0.070121281 -0.16110478
## Every day-Once a week                            -0.116262646 -0.19983576
## Every day-Several times a week                   -0.046141365 -0.09426890
##                                                           upr     p adj
## Never use-No access at home or work               0.084211550 0.4166564
## Less than once a month-No access at home or work  0.141572424 0.9999999
## Once a month-No access at home or work            0.222239705 0.7760026
## Several times a month-No access at home or work   0.045150406 0.7488226
## Once a week-No access at home or work             0.145044864 0.4143927
## Several times a week-No access at home or work    0.040936685 0.9984732
## Every day-No access at home or work              -0.019771613 0.0000922
## Less than once a month-Never use                  0.109873250 0.9988208
## Once a month-Never use                            0.190315876 0.9913403
## Several times a month-Never use                   0.014522037 0.1597082
## Once a week-Never use                             0.114848058 0.9907743
## Several times a week-Never use                    0.013261036 0.2700682
## Every day-Never use                              -0.045256713 0.0000001
## Once a month-Less than once a month               0.263169777 0.9639947
## Several times a month-Less than once a month      0.102181184 0.9583356
## Once a week-Less than once a month                0.206359830 0.9698963
## Several times a week-Less than once a month       0.120251603 0.9999368
## Every day-Less than once a month                  0.069341370 0.8319679
## Several times a month-Once a month                0.044211897 0.3252451
## Once a week-Once a month                          0.148763212 0.9999906
## Several times a week-Once a month                 0.063812374 0.6599377
## Every day-Once a month                            0.013294829 0.1068452
## Once a week-Several times a month                 0.231667297 0.1099855
## Several times a week-Several times a month        0.140538999 0.9313375
## Every day-Several times a month                   0.087749332 0.9999994
## Several times a week-Once a week                  0.020862220 0.2739248
## Every day-Once a week                            -0.032689537 0.0006540
## Every day-Several times a week                    0.001986165 0.0713892

Post hoc: Tukey test

Визуализируем результат

plot(Tukey)

Так лучше:

par(mfrow = c(1, 1), mar = c(4, 20, 5, 1))
plot(Tukey, las = 2, cex.axis = 0.5)

Post hoc: попарный t-test с поправкой Бонферонни

Если дисперсии не равны, используем попарный t-test с поправкой Бонферонни (но на самом деле, т.к. дисперсии равны, данный тест здесь использовать не нужно)

pairwise.t.test(ESS5$Power, ESS5$netuse, 
                adjust = "bonferroni") 
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  ESS5$Power and ESS5$netuse 
## 
##                        No access at home or work Never use
## Never use              0.64227                   -        
## Less than once a month 1.00000                   1.00000  
## Once a month           1.00000                   1.00000  
## Several times a month  1.00000                   0.21174  
## Once a week            0.64227                   1.00000  
## Several times a week   1.00000                   0.40159  
## Every day              9.2e-05                   5.3e-08  
##                        Less than once a month Once a month
## Never use              -                      -           
## Less than once a month -                      -           
## Once a month           1.00000                -           
## Several times a month  1.00000                0.47180     
## Once a week            1.00000                1.00000     
## Several times a week   1.00000                1.00000     
## Every day              1.00000                0.14128     
##                        Several times a month Once a week Several times a week
## Never use              -                     -           -                   
## Less than once a month -                     -           -                   
## Once a month           -                     -           -                   
## Several times a month  -                     -           -                   
## Once a week            0.14128               -           -                   
## Several times a week   1.00000               0.40159     -                   
## Every day              1.00000               0.00065     0.09162             
## 
## P value adjustment method: holm

Визуализация

Box plot

par(mfrow = c(1, 1), mar = c(3, 15, 4, 1))
boxplot(ESS5$Power ~ ESS5$netuse, las = 2, horizontal = T, ylab = "")

Ridgeline plot

library(ggridges)
library(ggplot2)
ggplot(ESS5, aes(x = Power, y = netuse)) +
  geom_density_ridges() +
  theme_ridges() + 
  theme(legend.position = "none")
## Picking joint bandwidth of 0.212
## Warning: Removed 1680 rows containing non-finite values (stat_density_ridges).

Boxplot из пакета sjPlot

library(sjPlot)
## #refugeeswelcome
plot_grpfrq(ESS5$Power, ESS5$netuse,  type = "box")
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

Чтобы разобраться, как работает ANOVA, сгененрируем данные

Представим, что у нас есть результаты теста по математике для трех групп: A, B и C.

Первая база данных

dat1 <- data.frame(cond = factor(rep(c("A","B", "C"), each = 200)), 
                  rating = c(rnorm(200, mean = -3),
                             rnorm(200, mean = 0), 
                             rnorm(200, mean = 3)))

Как это выглядит?

p1 <- ggplot(dat1, aes(x=rating, fill=cond)) + geom_density(alpha=.3) + 
  scale_x_continuous(limits = c(-15, 15))
p1

Вторая база данных

dat2 <- data.frame(cond = factor(rep(c("A","B", "C"), each = 200)), 
                  rating = c(rnorm(200, mean = -1),
                             rnorm(200, mean = 0), 
                             rnorm(200, mean = 1)))

Как это выглядит?

p2 <- ggplot(dat2, aes(x=rating, fill=cond)) + geom_density(alpha=.3) + 
  scale_x_continuous(limits = c(-15, 15))
p2

Третья база данных

dat3 <- data.frame(cond = factor(rep(c("A","B", "C"), each=200)),                    
                   rating = c(rnorm(200, mean = -3, sd = 3),
                              rnorm(200, mean = 0, sd = 3),
                              rnorm(200, mean = 3, sd = 3)))

Как это выглядит?

p3 <- ggplot(dat3, aes(x = rating, fill = cond)) + geom_density(alpha = .3) + 
  scale_x_continuous(limits = c(-15, 15))

p3

Четвёртая база данных

dat4 <- data.frame(cond = factor(rep(c("A","B", "C"), each=200)), 
                   rating = c(rnorm(200, mean = -1, sd = 3),
                              rnorm(200, mean = 0, sd = 3), 
                              rnorm(200, mean = 1, sd = 3)))

Как это выглядит?

mean = -1, 0, 1; sd = 3; normal distribution

p4 <- ggplot(dat4, aes(x = rating, fill = cond)) + 
  geom_density(alpha = .3) + 
  scale_x_continuous(limits = c(-15, 15)) +
  ylim(c(0, 0.5))
p4

Все графики вместе

library(gridExtra)
grid.arrange(p1,
  p2,
  p3,
  p4,
  nrow = 2)

Результаты теста для каждой из баз

anova1 <- oneway.test(dat1$rating ~ dat1$cond, var.equal = T) 
anova2 <- oneway.test(dat2$rating ~ dat2$cond, var.equal = T) 
anova3 <- oneway.test(dat3$rating ~ dat3$cond, var.equal = T) 
anova4 <- oneway.test(dat4$rating ~ dat4$cond, var.equal = T) 
anova1; anova2; anova3; anova4
## 
##  One-way analysis of means
## 
## data:  dat1$rating and dat1$cond
## F = 1767.5, num df = 2, denom df = 597, p-value < 2.2e-16
## 
##  One-way analysis of means
## 
## data:  dat2$rating and dat2$cond
## F = 191.02, num df = 2, denom df = 597, p-value < 2.2e-16
## 
##  One-way analysis of means
## 
## data:  dat3$rating and dat3$cond
## F = 191.96, num df = 2, denom df = 597, p-value < 2.2e-16
## 
##  One-way analysis of means
## 
## data:  dat4$rating and dat4$cond
## F = 30.938, num df = 2, denom df = 597, p-value = 1.643e-13

Размер эффекта

\[\eta^2\]

eta_sq(aov(dat1$rating ~ dat1$cond))
##        term etasq
## 1 dat1$cond 0.856
eta_sq(aov(dat2$rating ~ dat2$cond))
##        term etasq
## 1 dat2$cond  0.39
eta_sq(aov(dat3$rating ~ dat3$cond))
##        term etasq
## 1 dat3$cond 0.391
eta_sq(aov(dat4$rating ~ dat4$cond))
##        term etasq
## 1 dat4$cond 0.094

\[\omega^2\]

omega_sq(aov(dat1$rating ~ dat1$cond))
##        term omegasq
## 1 dat1$cond   0.855
omega_sq(aov(dat2$rating ~ dat2$cond))
##        term omegasq
## 1 dat2$cond   0.388
omega_sq(aov(dat3$rating ~ dat3$cond))
##        term omegasq
## 1 dat3$cond   0.389
omega_sq(aov(dat4$rating ~ dat4$cond))
##        term omegasq
## 1 dat4$cond   0.091

https://strengejacke.wordpress.com/2017/07/25/effect-size-statistics-for-anova-tables-rstats/

Сводная таблица

data1 data2 data3 data4
group mean 1 -3.000000 -1.0000000 -3.0000000 -1.0000000
group mean 2 0.000000 0.0000000 0.0000000 0.0000000
group mean 3 3.000000 1.0000000 3.0000000 1.0000000
SD 1.000000 1.0000000 3.0000000 3.0000000
F-statistic 1767.457157 191.0235392 191.9598828 30.9376187
p-value 0.000000 0.0000000 0.0000000 0.0000000
omega sq 0.854824 0.3877845 0.3889521 0.0907372

Последовательность действий