library(ggplot2)

ANOVA

One Way ANOVA

Data generation

# Five samples of five observations each; they are used below as the
# response values of groups 1 to 5.
y_1 <- c(5, 4, 8, 6, 3)
y_2 <- c(9, 7, 8, 6, 9)
y_3 <- c(3, 5, 2, 3, 7)
y_4 <- c(2, 3, 4, 1, 4)
y_5 <- c(7, 6, 9, 4, 7)

# Pool the samples into one vector, keeping the group order
# (elements 1-5 come from y_1, 6-10 from y_2, and so on).
y <- unlist(list(y_1, y_2, y_3, y_4, y_5))

group을 나눠준다(5X5)

# Number of observations in each of the five groups (five per group).
n <- rep(5, times = 5)

# Group label for every element of y: 1 repeated n[1] times, then 2, ...
group <- rep(seq_len(5), times = n)

# Pair each response with its group label, one row per observation.
group_df <- data.frame(y = y, group = group)

print(group_df)
##    y group
## 1  5     1
## 2  4     1
## 3  8     1
## 4  6     1
## 5  3     1
## 6  9     2
## 7  7     2
## 8  8     2
## 9  6     2
## 10 9     2
## 11 3     3
## 12 5     3
## 13 2     3
## 14 3     3
## 15 7     3
## 16 2     4
## 17 3     4
## 18 4     4
## 19 1     4
## 20 4     4
## 21 7     5
## 22 6     5
## 23 9     5
## 24 4     5
## 25 7     5

group의 클래스를 요인(factor)으로 변경한다

# Store the group labels as a factor so they are handled as categories
# instead of as numbers.
group_df$group <- factor(group_df$group)
class(group_df$group)
# Put the columns of group_df on the search path.
# NOTE: y and group also exist as global variables, and those globals take
# precedence over the attached columns (see the masking message below).
attach(group_df)
## The following objects are masked _by_ .GlobalEnv:
## 
##     group, y

요인(group)간 분포의 차이 보기위해 boxplot 생성

# Box plot of the response for each group.
# The variables are taken from group_df through `data`, so the plot uses the
# factor column of the data frame and does not depend on attach() or on
# same-named variables in the global environment.
boxplot(y ~ group,
        data = group_df,
        xlab = "Factor Levels",
        ylab = "delayed time for effect")
# Red horizontal reference line at the overall mean of the response.
abline(h = mean(group_df$y), col = "red")

ANOVA 모델 적합

# Take group_df off the search path again; the model below receives its
# variables through the `data` argument.
detach(group_df)

# One-way ANOVA of the response y on the factor group.
aov_result <- aov(formula = y ~ group, data = group_df)
print(aov_result)
## Call:
##    aov(formula = y ~ group, data = group_df)
## 
## Terms:
##                 group Residuals
## Sum of Squares  79.44     57.60
## Deg. of Freedom     4        20
## 
## Residual standard error: 1.697056
## Estimated effects may be unbalanced

적합 결과 확인

# ANOVA table (Df, Sum Sq, Mean Sq, F value, p-value) of the model that is
# already stored in aov_result; reusing the fit avoids estimating the same
# model a second time.
aov_summ <- summary(aov_result)
aov_summ
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## group        4  79.44   19.86   6.896 0.00117 **
## Residuals   20  57.60    2.88                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

F 분포 가설검정

# Critical value of the F distribution for a test at significance level alpha.
# Both degrees of freedom come from the fitted model instead of being typed in:
# the model rank counts the intercept plus the group effects, so rank - 1 is
# the numerator (between-group) degrees of freedom.
alpha <- 0.05
df_group <- aov_result$rank - 1
qf(p = 1 - alpha, df1 = df_group, df2 = aov_result$df.residual)
## [1] 2.866081
F value(6.896)가 유의수준 0.05에서의 임계값(2.866081)보다 크므로 귀무가설을 기각한다
=> 적어도 한 그룹의 평균은 다른 그룹의 평균과 유의미하게 다르다

요인간 차이가 뚜렷하지 않으면?

# Fix the seed so the random draw below is reproducible.
set.seed(2019)
# One random integer from 1 to 10 per row of group_df, drawn with replacement;
# these values are generated independently of the group labels.
random_value <- sample(1:10, size = nrow(group_df), replace = TRUE)
# Add the draw as a column. Assigning by name (instead of cbind) does not
# create a duplicate column if this chunk is run more than once.
group_df$random_value <- random_value
# Box plot of the random values per group, reading both variables from
# group_df so that no leftover global `group` vector is needed.
boxplot(random_value ~ group,
        data = group_df,
        xlab = "Random Levels",
        ylab = "delayed time for effect")
# Red horizontal reference line at the overall mean of the random values.
abline(h = mean(group_df$random_value), col = "red")

# One-way ANOVA of the random values on the factor group; this replaces the
# earlier model stored in aov_result.
aov_result <- aov(formula = random_value ~ group, data = group_df)
print(aov_result)
## Call:
##    aov(formula = random_value ~ group, data = group_df)
## 
## Terms:
##                 group Residuals
## Sum of Squares    7.6     196.4
## Deg. of Freedom     4        20
## 
## Residual standard error: 3.133688
## Estimated effects may be unbalanced
# ANOVA table of the random-value model already stored in aov_result;
# reusing the fit avoids estimating the same model a second time.
aov_summ <- summary(aov_result)
aov_summ
##             Df Sum Sq Mean Sq F value Pr(>F)
## group        4    7.6    1.90   0.193  0.939
## Residuals   20  196.4    9.82
p-value(0.939)가 유의수준(0.05)보다 크므로 귀무가설을 기각하지 못한다
=> 그룹 간 평균에 차이가 있다고 볼 통계적 근거가 없다. (유의미한 차이 없다)