m1 <- aov(wage ~ race, data=ISLR::Wage )
summary(m1)
## Df Sum Sq Mean Sq F value Pr(>F)
## race 3 63212 21071 12.2 5.9e-08 ***
## Residuals 2996 5158874 1722
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# plot
ggstatsplot::ggbetweenstats(
data = ISLR::Wage,
x = race,
y = wage,
xlab = "Race",
ylab = "Wage",
Bayes=F)
Figure 1.1: 不同種族的薪資
我們經常需要知道兩套樣本的差異是否達到統計上的顯著水準,例如兩種教學方法的成績、兩個餐廳的評價、兩個國家人民對於民主的滿意程度等等。
從兩個獨立母體抽樣,樣本的平均數差異(\(\bar{X_{1}}-\bar{X_{2}}\))的抽樣分配趨近於常態分配。
如果母體的變異數未知,只有樣本的變異數,我們可以用樣本變異數做為母體變藝術的估計。
那麼如何計算兩套樣本的標準誤?公式為:
\(SE(\bar{X_{1}}-\bar{X_{2}})=\sqrt{\frac{s_{1}^{2}}{n_{1}}+\frac{s_{1}^{2}}{n_{1}}}\)
如果\(n_{1}=n_{2}=n\), \(SE(\bar{X_{1}}-\bar{X_{2}})=\sqrt{\frac{s_{1}^{2}+s_{2}^{2}}{n}}\)
如果兩者樣本數不相等, \(SE(\bar{X_{1}}-\bar{X_{2}})=\frac{s_{1}+s_{2}}{2}\sqrt{\frac{1}{n_{1}}+\frac{1}{n_{2}}}\)
\(T=\frac{\bar{X_{1}}-\bar{X_{2}}}{SE(\bar{X_{1}}-\bar{X_{2}})}\)
sample1<-c(19.7475, 30.5562, 11.0734, 18.1730, 19.8387,
14.5291, 19.4998, 18.8374, 12.6873, 14.7627,
18.3869, 17.9287, 17.6973, 14.3439, 10.7374,
15.3563, 19.0878, 12.5745, 18.0030, 18.6004)
sample2<-c(17.4715, 9.8613, 23.4827, 13.6029, 20.0386,
19.6289, 24.9357, 17.8812, 12.6012, 9.7741,
19.9265, 16.4178, 20.4401, 15.1119, 7.9955,
5.1385, 22.4969, 17.4448, 17.6675, 7.0984)
dt=data.frame(sample1, sample2)
dt.m <- melt(dt) %>%
ggplot(aes(x=variable, y=value)) +
geom_boxplot() +
labs(x="Groups", y='') +
stat_summary(fun.y=mean, geom="point", shape=16, size=2, col='red')
dt.m
Figure 1.2: 兩套樣本的盒型圖
t.test(sample1, sample2, var.equal = T)
##
## Two Sample t-test
##
## data: sample1 and sample2
## t = 0.73, df = 38, p-value = 0.5
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.054 4.394
## sample estimates:
## mean of x mean of y
## 17.12 15.95
由以上的結果可以看出,\(p>0.05\),因此我們無法拒斥兩筆資料的平均數相等的假設。結論是兩種油漆的效果非常相近。
根據以上公式,我們再驗算一次:
n1<-length(sample1); n2<-length(sample2)
s1<-sd(sample1); s2<-sd(sample2)
se <-((s1+s2)/2)*(sqrt(n1^-1+n2^-1))
T=(mean(sample1)-mean(sample2))/se
cat('t =',T, '\n')
## t = 0.7414
pvalue=(1-pt(0.74, n1+n2-2))*2
cat('p value =', pvalue)
## p value = 0.4638
t.test(sample1, sample2, var.equal = T, alternative = 'less')
##
## Two Sample t-test
##
## data: sample1 and sample2
## t = 0.73, df = 38, p-value = 0.8
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf 3.855
## sample estimates:
## mean of x mean of y
## 17.12 15.95
可以看出\(p\geq 0.05\)。我們不拒斥第一套樣本等於第二套樣本的平均數的假設。
又或者用隨機抽樣的資料驗證第一套樣本平均值小於第二套樣本平均值。
set.seed(29393091)
x=rnorm(30)
set.seed(116)
y=rnorm(100, sqrt(2), 1)
t.test(x, y, alternative = 'greater')
##
## Welch Two Sample t-test
##
## data: x and y
## t = -5.4, df = 50, p-value = 1
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -1.433 Inf
## sample estimates:
## mean of x mean of y
## 0.2474 1.3422
\[H_{0}:\mu_{1}-\mu_{2}=0\]
\[H_{0}:\mu_{1}-\mu_{2}=D_{0}\]
R
計算如下:x1=20; n1=50; x2=13; n2=61
prop.test(c(x1,x2), c(n1, n2))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(x1, x2) out of c(n1, n2)
## X-squared = 3.7, df = 1, p-value = 0.05
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.001604 0.375375
## sample estimates:
## prop 1 prop 2
## 0.4000 0.2131
前面的雙樣本檢定假定兩套樣本互相獨立,但是有不同的平均值與標準差。如果是想要確定兩個變數來自於同一套樣本,兩次測量的平均值相等,則是進行成對樣本檢定。
例如病患服用藥物前後的病況、選手接受訓練前後的表現等等,適合進行成對樣本檢定。
用R
的t.test函數,可設定paired=T,進行成對樣本檢定,虛無假設為兩次測量的平均值相等。
例如比較民進黨候選人在台北市第2選區在2016年的立委選舉與2019年的立委補選的各里的平均得票率是否相同:
filetaipei<-here::here('data','taipei201619.csv')
taipei<-read.csv(filetaipei, header=T, fileEncoding = 'BIG5')
library(dplyr)
taipei<-taipei %>%
mutate(DPP2016.p=100*DPP2016/total2016, DPP2019.p=100*DPP2019/total2019)
with(taipei, t.test(DPP2016.p, DPP2019.p, paired = T))
##
## Paired t-test
##
## data: DPP2016.p and DPP2019.p
## t = 26, df = 196, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 11.07 12.90
## sample estimates:
## mean of the differences
## 11.98
結果顯示兩者有很大的差異,拒斥兩次選舉的平均值相減等於0的假設。
為了確認這個檢定結果,我們可以畫圖顯示兩次選舉的平均值。先整理成為長資料:
dat<- taipei %>%
select('2016'='DPP2016.p', '2019'='DPP2019.p')
dat.new <- melt(dat, variable.name = "election", value.name = "DPP")
dat.new <- dat.new %>%
group_by(election) %>%
mutate(grp.mean=mean(DPP)) %>%
mutate(Year=as.factor(election))
ggplot(dat.new, aes(x=DPP, color=Year)) +
geom_density(position="identity", alpha=.4) +
geom_vline(data=dat.new, aes(xintercept=grp.mean, color=Year),
linetype="dashed") +
labs(x="DPP's Vote Share", y="")
Figure 1.3: 民進黨2016大選及2019補選得票率
Two-way Anova(雙因子變異數分析)則是有兩個以上的類別變數,分析類別之間是否存在連續變數的平均值的差異。
Anova分析建立在三個假設:
\(SSB=\sum n_{k}(\bar{x_{k}}-\bar{x})^2\)
\(SSW=\sum_{k}\sum (x_{ki}-\bar{x_{k}})^2\)
全部的變異量等於這兩者相加: \(SST=SSB+SSW\)
每一組之間的平均離散程度用MSB(mean square between)表示,每一類別(或是每一組)內部的離散程度用MSW(mean square within)表示。計算方式為:
\(MSB=SSB/df(B) \hspace{.4cm} df(B)=k-1\)
\(MSW=SSW/df(W) \hspace{.4cm} df(W)=n-1-(k-1)=n-k\)
\(F=\frac{MSB}{MSW}\)
使用\(F\)分佈而不是\(t\)分佈檢驗假設成立與否。英國統計學家兼生物學家羅納德·費雪(Ronald Aylmer Fisher)發明了\(F\)檢驗。
F 分佈的參數是兩個自由度,可畫圖 ?? 如下:
curve(df(x, df1=1, df2=2), from=0, to=5, ylab='')
Figure 2.1: 三種F分佈
curve(df(x, df1=5, df2=6), from=0, to=5, col='red', add=T)
Figure 2.2: 三種F分佈
curve(df(x, df1=20, df2=30), from=0, to=5, col='blue2', add=T)
abline(v=1, lwd=1.5, lty=2, col='green2')
legend('topright', c('df=1;df=2','df=5;df=6',
'df=20;df=30'),
col=c('black',"red", "blue2"),
lty=c(1,1,1))
Figure 2.3: 三種F分佈
圖片顯示自由度越大、\(F\)分佈圖形越接近常態分佈。
R
的\(F\)分佈有以下指令:
rf(n, df1, df1)回傳從\(F\)分佈抽樣的n個樣本
以Orange這筆資料為例,我們想檢驗每一種類型的樹的樹圍是否相等。先整理資料,計算平均樹圍以及標準差:
Orange$tree<-factor(Orange$Tree, levels=c("1","2", "3", "4","5"))
library(dplyr)
mydata <- Orange %>%
group_by (tree) %>%
summarize(
count = n(),
avg = mean(circumference, na.rm = TRUE),
sd = sd(circumference, na.rm = TRUE)
)
kable(mydata) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
tree | count | avg | sd |
---|---|---|---|
1 | 7 | 99.57 | 43.29 |
2 | 7 | 135.29 | 66.32 |
3 | 7 | 94.00 | 42.98 |
4 | 7 | 139.29 | 71.90 |
5 | 7 | 111.14 | 58.86 |
可以看出五種平均樹圍之間有一些差異。
用箱形圖 2.4 檢視五種類型的樹圍:
p=ggplot(Orange, aes(x=tree, y=circumference))
p+geom_boxplot(data=Orange, aes(color=tree)) +
theme_bw()
Figure 2.4: 五種樹的樹圍
G=ggplot(Orange, aes(x=circumference, fill=tree, alpha=0.2))
G+geom_density() +
xlim(c(0, 300)) +
theme_bw()
Figure 2.5: 箱型圖與機率密度
圖 2.5 顯示部分類別可能不是成常態分佈,而且離散程度可能不相等。
最後用aov函數進行Anova分析:
res.aov <- aov(circumference ~ tree, data = Orange)
# Summary of the analysis
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## tree 4 11841 2960 0.88 0.49
## Residuals 30 100525 3351
結果顯示\(p>0.05\),無法拒斥五種平均樹圍相等的假設。
我們可以畫圖檢視變異數相等的假設:
plot(res.aov, 1, cex=1.5, lwd=1.5)
Figure 2.6: 變異數圖形
圖 2.6 顯示,每一類別的平均值跟觀察值與該平均值之間的殘差(residual)並沒有明顯的關聯。因此,變異數相等的假設成立。
R
另外提供Bartlett以及Levene兩種檢定變異數相等的函數。Bartlett檢驗的做法是:
bartlett.test(circumference ~ tree, data = Orange)
##
## Bartlett test of homogeneity of variances
##
## data: circumference by tree
## Bartlett's K-squared = 2.5, df = 4, p-value = 0.7
library(car)
leveneTest(circumference ~ tree, data = Orange)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 4 0.59 0.67
## 30
兩個檢驗結果都是不拒斥變異數相等的假設。可以先進行這兩個檢驗,再進行變異數分析。
雖然我們無法拒斥每一組平均數相等的假設,但是我們仍然可以用TukeyHSD函數檢驗哪些組別之間可能有差異:
TukeyHSD(res.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = circumference ~ tree, data = Orange)
##
## $tree
## diff lwr upr p adj
## 2-1 35.714 -54.04 125.46 0.7766
## 3-1 -5.571 -95.32 84.18 0.9998
## 4-1 39.714 -50.04 129.46 0.7030
## 5-1 11.571 -78.18 101.32 0.9956
## 3-2 -41.286 -131.04 48.46 0.6726
## 4-2 4.000 -85.75 93.75 0.9999
## 5-2 -24.143 -113.89 65.61 0.9343
## 4-3 45.286 -44.46 135.04 0.5930
## 5-3 17.143 -72.61 106.89 0.9806
## 5-4 -28.143 -117.89 61.61 0.8910
alpha=0.01; cat("conf=0.99, mean=0, sd=1, Z=", qnorm(1-(alpha/2), 0, 1),"\n")
## conf=0.99, mean=0, sd=1, Z= 2.576
alpha=0.05; cat("conf=0.95, mean=0, sd=1, Z=", qnorm(1-alpha/2),"\n")
## conf=0.95, mean=0, sd=1, Z= 1.96
alpha=0.1; cat("conf=0.90, mean=0, sd=1, Z=", qnorm(1-alpha/2),"\n")
## conf=0.90, mean=0, sd=1, Z= 1.645
alpha=0.05; cat("conf=0.95, mean=1, sd=1, Z=", qnorm(1-alpha/2, 1, 1),"\n")
## conf=0.95, mean=1, sd=1, Z= 2.96
Figure 3.1: 信賴區間0.95常態分佈圖
cat("z=1.99, mean=0, sd=1, p=", pnorm(1.99, 0, 1, lower.tail = TRUE),"\n")
## z=1.99, mean=0, sd=1, p= 0.9767
cat("z=1.96, mean=0, sd=1, p=", pnorm(1.96, 0, 1, lower.tail = TRUE),"\n")
## z=1.96, mean=0, sd=1, p= 0.975
cat("z=1.68, mean=0, sd=1, p=", pnorm(1.68, 0, 1, lower.tail = TRUE),"\n")
## z=1.68, mean=0, sd=1, p= 0.9535
Figure 3.2: 左尾累積機率圖
cat("z=1.99, mean=0, sd=1, p=", pnorm(1.99, 0, 1, lower.tail = FALSE),"\n")
## z=1.99, mean=0, sd=1, p= 0.0233
cat("z=1.96, mean=0, sd=1, p=", pnorm(1.96, 0, 1, lower.tail = FALSE),"\n")
## z=1.96, mean=0, sd=1, p= 0.025
cat("z=1.68, mean=0, sd=1, p=", pnorm(1.68, 0, 1, lower.tail = FALSE),"\n")
## z=1.68, mean=0, sd=1, p= 0.04648
Figure 3.3: 右尾累積機率圖
n=10; alpha=0.01; cat("conf=0.99, n=10, T=", qt(1-alpha/2, n-1),"\n")
## conf=0.99, n=10, T= 3.25
n=10; alpha=0.05; cat("conf=0.95, n=10, T=", qt(1-alpha/2, n-1),"\n")
## conf=0.95, n=10, T= 2.262
n=10; alpha=0.1; cat("conf=0.90, n=10, T=", qt(1-alpha/2, n-1),"\n")
## conf=0.90, n=10, T= 1.833
n=30; alpha=0.05; cat("conf=0.95, n=30, T=", qt(1-alpha/2, n-1),"\n")
## conf=0.95, n=30, T= 2.045
n=10; alpha=0.01;cat("t=1.99, n=10, p=", pt(1.99, n-1),"\n")
## t=1.99, n=10, p= 0.9611
n=10; alpha=0.05;cat("t=1.96, n=10, p=", pt(1.96, n-1),"\n")
## t=1.96, n=10, p= 0.9592
n=10; alpha=0.1;cat("t=1.68, n=10, p=", pt(1.68, n-1),"\n")
## t=1.68, n=10, p= 0.9364
n=30; alpha=0.05;cat("t=1.68, n=30, p=", pt(1.68, n-1),"\n")
## t=1.68, n=30, p= 0.9481
n=10; alpha=0.01;cat("t=1.99, n=10, p=", pt(1.99, n-1, lower.tail = FALSE),"\n")
## t=1.99, n=10, p= 0.0389
n=10; alpha=0.05;cat("t=1.96, n=10, p=", pt(1.96, n-1, lower.tail = FALSE),"\n")
## t=1.96, n=10, p= 0.04082
n=10; alpha=0.1;cat("t=1.68, n=10, p=", pt(1.68, n-1, lower.tail = FALSE),"\n")
## t=1.68, n=10, p= 0.06363
n=30; alpha=0.05;cat("t=1.68, n=30, p=", pt(1.68, n-1, lower.tail = FALSE),"\n")
## t=1.68, n=30, p= 0.05185
set.seed(02138)
X<-rnorm(100, 0, 1)
t.test(X, conf.level=0.95)
##
## One Sample t-test
##
## data: X
## t = -0.81, df = 99, p-value = 0.4
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.2795 0.1168
## sample estimates:
## mean of x
## -0.08133
#manual computing
n=length(X)
mean.x<-mean(X)
SD <- sqrt(var(X)/n)
alpha=0.05
zstar<-qnorm(1-alpha/2)
cat("conf=0.95, interval estimate=[", mean.x-zstar*SD, ",",
mean.x+zstar*SD,"]")
## conf=0.95, interval estimate=[ -0.277 , 0.1144 ]
set.seed(02138)
X<-rbinom(100, 1, 0.5)#randomly drawing samples from binomial distribution
n=length(X)
s<-length(X[X==1])
prop.test(s, n, conf.level = 0.95)
##
## 1-sample proportions test with continuity correction
##
## data: s out of n, null probability 0.5
## X-squared = 0.49, df = 1, p-value = 0.5
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.3609 0.5622
## sample estimates:
## p
## 0.46
#manual computing
phat<-mean(X)
SE <- sqrt(phat*(1-phat)/n)
alpha=0.05
tstar<-qt(1-alpha/2, df=n-1)
cat("conf=0.95, interval estimate=[", phat-tstar*SE, ",",
phat+tstar*SE,"]")
## conf=0.95, interval estimate=[ 0.3611 , 0.5589 ]
set.seed(02138)
X<-rbinom(100, 1, 0.5)#randomly drawing samples from binomial distribution
n=length(X)
s<-length(X[X==1])
binom.test(s, n, conf.level = 0.95)
##
## Exact binomial test
##
## data: s and n
## number of successes = 46, number of trials = 100, p-value = 0.5
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.3598 0.5626
## sample estimates:
## probability of success
## 0.46
Respondent | Candidate.1 | Candidate.2 |
---|---|---|
1 | 75 | 66 |
2 | 80 | 74 |
3 | 86 | 74 |
4 | 70 | 74 |
5 | 80 | 89 |
6 | 69 | 79 |
7 | 59 | 83 |
8 | 86 | 85 |
9 | 74 | 80 |
10 | 80 | 80 |
11 | 88 | 60 |
12 | 88 | 59 |
13 | 76 | 49 |
14 | 90 | 42 |
15 | 91 | 30 |
16 | 70 | 72 |
17 | 68 | 60 |
18 | 55 | 88 |
set.seed(377)
DV<-rnorm(1000, 0, 3)
X.1<-round(runif(1000, 1, 4))
X <- as.factor(X.1)
## 最後更新日期 04/14/2021