library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

Exercise 7.1

Find the smallest sample size giving power of at least .7 when testing equality of six groups at the .05 level when \(ζ=4n\).

#7.1
g = 6                     # 分組
n = 2                     # 每組個數
N = n*g                   # 總個數
F = qf(0.95,g-1,N-g)      # a=0.05 虛無假設下臨界點之F值
power=1-pf(F,g-1,N-g,4*n) # 檢定力power

while(power<0.7){
  n = n+1
  N = n*g
  F = qf(0.95,g-1,N-g)
  power = 1-pf(F,g-1,N-g,4*n)
}
n # 每組最小個數
## [1] 4
N # 總樣本數
## [1] 24
power #最小檢定力
## [1] 0.7640361
#plot
seq <- seq(0,0.995,by=0.005)
q <- qf(seq,g-1,N-g) 
dp <- df(q,g-1,N-g) 
nq <- qf(seq,g-1,N-g,4*n)
ndp <- df(nq,g-1,N-g,4*n)
plot(dp~q,type="l",main="F 機率密度圖",xlab="x",ylab = "pdf",xlim=c(0,15),ylim=c(0,0.8),col="#0000FF",lwd=3)
lines(ndp~nq,type="l",col="#00FF00",lwd=3)
abline(v=F,col ="#FFFF00",lwd=3)

#ggplot
data <- cbind(q,dp,nq,ndp)
colnames(data) <- c("H0_x","H0_y","H1_x","H1_y")
data <- as.data.frame(data)
ggplot(data)+ labs(title = "F distribution (central & non-central)",size=20,x="x",y="pdf")+
  theme(axis.text=element_text(size=12),title=element_text(size=20))+
  geom_line(mapping = aes(x=H0_x,y=H0_y),col="#00EE00",lwd=1.5)+
  geom_line(mapping = aes(x=H1_x,y=H1_y),col="#00BBFF",lwd=1.5)+
  geom_vline(mapping = aes(xintercept=F),col="#FF0000",lwd=2,linetype=2)+
  scale_x_continuous(breaks = c(0:2,F,3:15),labels = c(0:2,"F",3:15))+
  theme(panel.background = element_rect(fill ="#FFFFF0", colour = "black",size = 2))

Exercise 7.3

What is the probability of rejecting the null hypothesis when there are four groups, the sum of the squared treatment effects is 6, the error variance is 3, the group sample sizes are 4, and ε is .01?

#7.3
g = 4                   # 分組組數
n = 4                   # 每組個數
N = 16                  # 總樣本數
sigma = 3               # error variance
alphasq = 6             # 給定的處理平方和
E1 = 0.01               # 顯著水準
zeta = (alphasq*n)/sigma          # 非中心參數
F = qf(1-E1,g-1,N-g)              # 虛無假設下之臨界值
Power = 1-pf(F,g-1,N-g,zeta)      # 檢定力POWER
#plot
seq <- seq(0,0.995,by=0.005)
q <- qf(seq,g-1,N-g) 
dp <- df(q,g-1,N-g) 
nq <- qf(seq,g-1,N-g,zeta)
ndp <- df(nq,g-1,N-g,zeta)
#ggplot
data <- cbind(q,dp,nq,ndp)
colnames(data) <- c("H0x","H0y","H1x","H1y")
data <- as.data.frame(data)
ggplot(data)+ labs(title = "F distribution (central & non-central)",size=40,x="x",y="pdf")+
  theme(axis.text=element_text(size=12),title=element_text(size=20))+
  geom_line(mapping = aes(x=H0x,y=H0y),col="#00EE00",lwd=1.5)+
  geom_line(mapping = aes(x=H1x,y=H1y),col="#00BBFF",lwd=1.5)+
  geom_vline(mapping = aes(xintercept=F),col="#FF0000",lwd=1.7,linetype=2)+
  scale_x_continuous(breaks = c(0:5,F,7:21),labels = c(0:5,"F",7:21),limits = c(0,21))+
  theme(panel.background = element_rect(fill ="#FFFFF0", colour = "black",size = 2))

Problem 7.2

Nondigestible carbohydrates can be used in diet foods, but they may have effects on colonic hydrogen production in humans. We want to test to see if inulin, fructooligosaccharide, and lactulose are equivalent in their hydrogen production. Preliminary data suggest that the treatmentmeans could be about 45, 32, and 60 respectively, with the error variance conservatively estimated at 35. How many subjects do we need to have power .95 for this situation when testing at the \(ε_I = .01\) level?

#problem7.2
n = 2                   # 每組的樣本數
g = 3                   # 組數
sigma = 35              # 給定的誤差變異數
ybar=c(45,32,60)        # 每組的估計值
a = (ybar-mean(ybar))
asq = sum(a^2)
zeta = 0               
power = 0
while(power<0.95){
  F = qf(0.99,g-1,n*3-g)
  zeta = (n*asq)/sigma
  power = 1-pf(F,g-1,n*3-g,ncp=zeta)
  n = n+1
}
N=n*g;N                 #檢定力超過0.95的最低樣本數
## [1] 15
power                   #最後的檢定力
## [1] 0.9808418
#plot
seq <- seq(0,0.995,by=0.005)
q <- qf(seq,g-1,N-g) 
dp <- df(q,g-1,N-g) 
nq <- qf(seq,g-1,N-g,zeta)
ndp <- df(nq,g-1,N-g,zeta)
#ggplot
data <- cbind(q,dp,nq,ndp)
colnames(data) <- c("H0x","H0y","H1x","H1y")
data <- as.data.frame(data)
ggplot(data)+ labs(title = "F distribution (central & non-central)",x="x",y="pdf")+
  theme(axis.text=element_text(size=12),title=element_text(size=20))+
  geom_line(mapping = aes(x=H1x,y=H1y),col="#00BBFF",lwd=1.5)+
  geom_vline(mapping = aes(xintercept=F),col="#FF0000",lwd=2,linetype=2)+
  theme(panel.background = element_rect(fill ="#FFFFF0", colour = "black",size = 2))

Exercise 6.3

# Exercise 6.3
library(dplyr)
x  <- c(18.2,16.4,10.0,13.5,13.5,6.7,12.2,18.2,13.5,16.4,
        5.5,12.2,11.0,6.7,16.4,8.2,7.4,12.2,6.7,11.0,
        5.5,5.0,8.2,9.0,10.0,6.0,7.4,5.5,12.2,8.2,
        6.0,7.4,12.2,11.0,5.0,7.4,7.4,5.5,6.7,5.5)
A = factor(rep(1:4, each=10)) # 建立框架變數 ,為每個 X 樣本分別標上 1, 2, 3, 4 等標記
data <- data.frame(x,A)
aov.data = aov(x~A, data)     # 進行「變異數分析」,看看 X 與 A 之間是否相關
summary(aov.data)             # 印出「變異數分析」的結果報表
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## A            3  265.5   88.49   9.865 6.91e-05 ***
## Residuals   36  322.9    8.97                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(data$x~data$A)           # 繪出 X~A 的盒狀圖

ggplot(data,aes(A,x))+
  geom_boxplot(aes(colour=A))

# 事後檢定 用Tukey法檢定兩兩之間是否有顯著性差異
TukeyHSD(aov.data,conf.level=0.95)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = x ~ A, data = data)
## 
## $A
##      diff        lwr        upr     p adj
## 2-1 -4.13  -7.737394 -0.5226057 0.0195604
## 3-1 -6.16  -9.767394 -2.5526057 0.0002860
## 4-1 -6.45 -10.057394 -2.8426057 0.0001496
## 3-2 -2.03  -5.637394  1.5773943 0.4390060
## 4-2 -2.32  -5.927394  1.2873943 0.3224748
## 4-3 -0.29  -3.897394  3.3173943 0.9963493
plot(TukeyHSD(aov.data,conf.level=0.95))

# 常態性檢定
residuals <- lm(x~.,data)$residuals
shapiro.test(residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals
## W = 0.97947, p-value = 0.6699
# 獨立性檢定
a <- table(data)
chisq.test(table(data))
## Warning in chisq.test(table(data)): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(data)
## X-squared = 43.6, df = 36, p-value = 0.1796
chisq.test(a, simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  a
## X-squared = 43.6, df = NA, p-value = 0.1609
# 同質性檢定
bartlett.test(x~A) # 對常態假設敏感
## 
##  Bartlett test of homogeneity of variances
## 
## data:  x by A
## Bartlett's K-squared = 2.8347, df = 3, p-value = 0.4178