library(expss); library(ISLR); data("Wage")
tbl_caption<-cro_mean_sd_n(Wage$wage, Wage$race) %>%
set_caption('不同種族的平均薪資、標準差')
tbl_caption
不同種族的平均薪資、標準差 | ||||
Wage$race | ||||
---|---|---|---|---|
1. White | 2. Black | 3. Asian | 4. Other | |
Wage$wage | ||||
Mean | 112.6 | 101.6 | 120.3 | 90 |
Std. dev. | 41.7 | 37.2 | 46.4 | 29.2 |
Unw. valid N | 2480 | 293 | 190 | 37 |
m1 <- aov(wage ~ race, data=ISLR::Wage )
summary(m1)
## Df Sum Sq Mean Sq F value Pr(>F)
## race 3 63212 21071 12.2 5.9e-08 ***
## Residuals 2996 5158874 1722
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# plot
ggstatsplot::ggbetweenstats(
data = ISLR::Wage,
x = race,
y = wage,
xlab = "Race",
ylab = "Wage",
plot.type = 'box',
type = "parametric",
bf.message = F,
p.adjust.method = 'none')
Figure 1.1: 不同種族的薪資
\[H_{0}:\mu_{x}=\mu_{y}\]
\[H_{1}:\mu_{x}<\mu_{y},\mu_{x}>\mu_{y},\mu_{x}\neq \mu_{y}\]
\[T=\frac{(\bar{X}-\bar{Y})-E(\bar{X}-\bar{Y}|H_{0})}{SE(\bar{X}-\bar{Y}|H_{0})}\]
\[s_{pooled}^2=\frac{(n_{x}-1)s^{2}_{x}+(n_{y}-1)s^{2}_{y}}{n_{x}+n_{y}-2}\]
\[s_{pooled}^2=\frac{s^{2}_{x}+s^{2}_{y}}{2}\]
\[SE(\mu_{x}-\mu_{y})=s_{p}\sqrt{\frac{1}{n_{x}}+\frac{1}{n_{y}}}\]
\[SE(\mu_{x}-\mu_{y})=\sqrt{\frac{s_{x}^{2}}{n_{x}}+\frac{s_{y}^{2}}{n_{y}}}\]
\[SE(\mu_{x}-\mu_{y})=\sqrt{\frac{s_{x}^{2}+s_{y}^{2}}{n}}\]
sample1<-c(19.7475, 20.5562, 16.0734, 18.1730, 19.8387,
14.5291, 19.4998, 18.8374, 16.6873, 14.7627,
18.3869, 17.9287, 21.6973, 19.3439, 15.7374,
18.3563, 22.0878, 22.5745, 18.0030, 18.6004)
sample2<-c(19.4715, 18.8613, 17.4827, 21.6029, 23.0386,
20.6289, 19.9357, 19.8812, 16.6012, 21.7741,
21.9265, 18.4178, 20.4401, 20.1119, 22.9955,
21.1385, 19.4969, 20.4448, 19.6675, 17.0984)
dt=data.frame(sample1, sample2)
dt.m <- melt(dt) %>%
ggplot(aes(x=variable, y=value)) +
geom_boxplot() +
labs(x="Groups", y='') +
stat_summary(fun.y=mean, geom="point", shape=16, size=2, col='red')
dt.m
Figure 1.2: 兩套樣本的盒型圖
with(dt, plot(density(sample1), main='', col='#22ee00', ylim=c(0,0.25)))
with(dt, lines(density(sample2), lty=2, col='#aa11cc'))
Figure 1.3: 雙樣本機率密度圖
假設兩個母體的變異數相同。
根據以上公式,我們首先計算標準差以及標準誤:
n1<-length(sample1); n2<-length(sample2)
var1<-var(sample1); var2<-var(sample2)
sp=(var1+var2)/2
se <-sqrt(sp)*(sqrt(n1^-1+n2^-1)); se
## [1] 0.6422
T=(mean(sample1)-mean(sample2))/se
cat('T =',T, '\n')
## T = -2.304
Figure 1.4: 左尾累積機率圖
pvalue=pt(T, n1+n2-2)*2
cat('p value =', pvalue)
## p value = 0.02677
t.test(sample1, sample2, var.equal = T)
##
## Two Sample t-test
##
## data: sample1 and sample2
## t = -2.3, df = 38, p-value = 0.03
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.7798 -0.1797
## sample estimates:
## mean of x mean of y
## 18.57 20.05
由以上的結果可以看出,\(T\)=-2.304,\(p<0.05\),因此我們可以拒斥兩筆資料的平均數相等的假設。95%的區間是(-2.779, -0.179),不包含0。結論是兩種油漆的效果不相同。
我們改假設變異數不等,得到類似結果。
t.test(sample1, sample2, var.equal = F)
##
## Welch Two Sample t-test
##
## data: sample1 and sample2
## t = -2.3, df = 36, p-value = 0.03
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.7820 -0.1775
## sample estimates:
## mean of x mean of y
## 18.57 20.05
\[H_{0}:\mu_{x}>\mu_{y}\]
t.test(sample1, sample2, var.equal = T, alternative = 'less')
##
## Two Sample t-test
##
## data: sample1 and sample2
## t = -2.3, df = 38, p-value = 0.01
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -0.397
## sample estimates:
## mean of x mean of y
## 18.57 20.05
可以看出\(p< 0.05\)。我們拒斥第一筆資料平均數大於第二筆資料的平均數的假設,結論是第一組油漆的效果比較差。
雙樣本的檢定需要比較特別的標準誤以及自由度,檢定過程與單樣本的\(t\)檢定相同。
\[H_{0}: \mu_{boy}=\mu_{girl}\]
\[H_{1}: \mu_{boy}>\mu_{girl}\]
library(UsingR);data(babyboom)
male <- dplyr::filter(babyboom, gender=='boy')
female <- dplyr::filter(babyboom, gender=='girl')
t.test(male$wt, female$wt, alternative='greater')
##
## Welch Two Sample t-test
##
## data: male$wt and female$wt
## t = 1.4, df = 28, p-value = 0.08
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -48 Inf
## sample estimates:
## mean of x mean of y
## 3375 3132
\[H_{0}:p_{x}-p_{y}=0\]
\[H_{0}:p_{x}-p_{y}=D_{0}\]
\[\sqrt{\frac{p_{1}(1-p_{1})}{n_{1}}+\frac{p_{2}(1-p_{2})}{n_{2}}}\]
\[T=\frac{p_{1}-p_{2}}{SE}\]
x1=20; n1=50; x2=13; n2=61
prop.test(c(x1,x2), c(n1, n2), correct = FALSE)
##
## 2-sample test for equality of proportions without continuity
## correction
##
## data: c(x1, x2) out of c(n1, n2)
## X-squared = 4.6, df = 1, p-value = 0.03
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.01659 0.35718
## sample estimates:
## prop 1 prop 2
## 0.4000 0.2131
例如病患服用藥物前後的病況、選手接受訓練前後的表現、受訪者第一次受訪跟第二次受訪的回答等等,適合進行成對樣本檢定。
用R
的\(\tt{t.test()}\)函數,可設定\(\tt{paired=T}\),進行成對樣本檢定,虛無假設為兩次測量的平均值相等。
例如比較民進黨候選人在台北市第2選區在2016年的立委選舉與2019年的立委補選的各里的平均得票率是否相同:
filetaipei<-here::here('data','taipei201619.csv')
taipei<-read.csv(filetaipei, header=T, fileEncoding = 'BIG5')
library(dplyr)
taipei<-taipei %>%
mutate(DPP2016p=100*DPP2016/total2016, DPP2019p=100*DPP2019/total2019)
with(taipei, t.test(DPP2016p, DPP2019p, paired = T))
##
## Paired t-test
##
## data: DPP2016p and DPP2019p
## t = 26, df = 196, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 11.07 12.90
## sample estimates:
## mean of the differences
## 11.98
結果顯示兩者有很大的差異,拒斥兩次選舉的平均值相減等於0的假設。
為了確認這個檢定結果,我們可以畫圖顯示兩次選舉的平均值。先整理成為長資料:
filetaipei<-here::here('data','taipei201619.csv')
taipei<-read.csv(filetaipei, header=T, fileEncoding = 'BIG5')
taipei<-taipei %>%
mutate(DPP2016p=100*DPP2016/total2016,
DPP2019p=100*DPP2019/total2019)
dat<- data.frame(taipei$DPP2016p, taipei$DPP2019p)
colnames(dat)<-c('2016','2019')
dat.new <- reshape2::melt(dat, variable.name = "election", value.name = "DPP")
dat.new <- dat.new %>%
group_by(election) %>%
mutate(grp.mean=mean(DPP)) %>%
mutate(Year=as.factor(election))
ggplot(dat.new, aes(x=DPP, color=Year)) +
geom_density(position="identity", alpha=.4) +
geom_vline(data=dat.new, aes(xintercept=grp.mean, color=Year),
linetype="dashed") +
labs(x="DPP's Vote Share", y="")
Figure 1.5: 民進黨2016大選及2019補選得票率
\[H_{0}: \mu_{a}=\mu_{b}=\mu_{c}\]
\[SST=\sum_{i=1}\sum_{k=1}(x_{ij}-\bar{\bar{x}})^2\]
\[\bar{\bar{x}}=\frac{\sum_{i=1}\sum_{k=1}x_{ik}}{n}\]
\[SSB=\sum n_{k}(\bar{x_{k}}-\bar{x})^2\]
組內變異也稱為刺激的變異(SSTr),也有人寫成SSF(sum of squares due to factor)。換句話說,是「組」這個變數帶來的差異。
然後是組內變異(SSW, sum of square within groups),為各組的變異數的加總。
\[\begin{align*} SSW & =\sum_{k}\sum (x_{ki}-\bar{x_{k}})^2 \\ & =\sum_{k}(n_{k}-1)s_{k}^2 \end{align*}\]
有人把SSW寫成SSE(error sum of squares),也就是組間差異解釋全部變異不足的部分就是SSE。
全部的變異量等於這兩者相加:
\[SST=SSB+SSW\]
\[MSB=MSF=SSB/df(B) \quad df(B)=k-1\]
\[MSW=MSE=SSW/df(W) \quad df(W)=n-1-(k-1)=n-k\]
\[F=\frac{MSB}{MSW}\]
使用\(F\)分佈而不是\(t\)分佈檢驗假設成立與否。英國統計學家兼生物學家羅納德·費雪(Ronald Aylmer Fisher)發明了\(F\)檢驗。
F 分佈的參數是兩個自由度,第一個自由度是\(k-1\),也就是分組數減去1,第二個自由度是\(n-k\),也就是個案數減去分組數。可畫圖 ?? 如下:
curve(df(x, df1=1, df2=2), from=0, to=5, ylab='')
Figure 2.1: 三種F分佈
curve(df(x, df1=5, df2=6), from=0, to=5, col='red', add=T)
Figure 2.2: 三種F分佈
curve(df(x, df1=20, df2=30), from=0, to=5, col='blue2', add=T)
abline(v=1, lwd=1.5, lty=2, col='green2')
legend('topright', c('df=1;df=2','df=5;df=6',
'df=20;df=30'),
col=c('black',"red", "blue2"),
lty=c(1,1,1))
Figure 2.3: 三種F分佈
圖 ??顯示自由度越大、\(F\)分佈圖形越接近常態分佈。
R
的\(F\)分佈有以下指令:
df(x, df1, df2)回傳x百分比的機率 (y軸的高度)
pf(q, df1, df2)回傳q百分比的累積機率
qf(p, df1, df2)回傳p值的\(F\)值
rf(n, df1, df2)回傳從\(F\)分佈抽樣的n個樣本
alpha=0.05
pf(1.8, 9, 50, lower.tail = T)
## [1] 0.9084
qf(1-alpha/2, 9, 50)
## [1] 2.381
curve( df(x, df1=9, df2=50),
xlim = c(0, 3),
ylab = "Density",
# main = "F機率密度",
col='#e2a1cc', lwd=2)
n=60; k=10; alpha=0.05
upperbound <- qf(1-alpha/2, k-1, n-k)
x<-seq(upperbound, 4, by=0.1)
y=df(x, k-1, n-k)
cord.x1<-c(upperbound, x, 4)
cord.y1<-c(0, y, 0)
polygon(cord.x1, cord.y1,col='#20aaee')
text(2.48, 0.14, "2.5%")
Figure 2.4: F檢定圖
\[\bar{X_{i}}-\bar{X_{j}}\pm t_{n-k,\alpha/2m}\sqrt{MSE}\sqrt{\frac{1}{n_{i}}+\frac{1}{n_{j}}}\]
Orange$tree<-factor(Orange$Tree, levels=c("1","2", "3", "4","5"))
mydata <- Orange %>%
group_by (tree) %>%
summarise(
count = n(),
avg = mean(circumference, na.rm = TRUE),
sd = sd(circumference, na.rm = TRUE)
)
kable(mydata) %>%
kable_styling(bootstrap_options = "striped", full_width = F)
tree | count | avg | sd |
---|---|---|---|
1 | 7 | 99.57 | 43.29 |
2 | 7 | 135.29 | 66.32 |
3 | 7 | 94.00 | 42.98 |
4 | 7 | 139.29 | 71.90 |
5 | 7 | 111.14 | 58.86 |
可以看出五種平均樹圍之間有一些差異。
用箱形圖 2.5 檢視五種類型的樹圍:
p=ggplot(Orange, aes(x=tree, y=circumference))
p+geom_boxplot(data=Orange, aes(color=tree)) +
theme_bw()
Figure 2.5: 五種樹的樹圍
G=ggplot(Orange, aes(x=circumference, fill=tree, alpha=0.2))
G+geom_density() +
xlim(c(0, 300)) +
theme_bw()
Figure 2.6: 箱型圖與機率密度
圖 2.6 顯示部分類別可能不是成常態分佈,而且離散程度可能不相等。
res.aov <- aov(circumference ~ tree, data = Orange)
# Summary of the analysis
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## tree 4 11841 2960 0.88 0.49
## Residuals 30 100525 3351
結果顯示\(p>0.05\),無法拒斥五種平均樹圍相等的假設。
我們可以畫圖檢視變異數相等的假設:
plot(res.aov, 1, cex=1.5, lwd=1.5)
Figure 2.7: 變異數圖形
首先我們計算這5組的個案數。這邊要用到\(\tt{apply()}\)這個函式,它可以計算每一欄或者每一列的個案數、平均數、標準差等等,然後存成向量。
#organize data
tmp <- Orange
tmpdf<-data.frame(A=tmp$circumference[tmp$Tree==1],
B=tmp$circumference[tmp$Tree==2],
C=tmp$circumference[tmp$Tree==3],
D=tmp$circumference[tmp$Tree==4],
E=tmp$circumference[tmp$Tree==5])
tmpdf
## A B C D E
## 1 30 33 30 32 30
## 2 58 69 51 62 49
## 3 87 111 75 112 81
## 4 115 156 108 167 125
## 5 120 172 115 179 142
## 6 142 203 139 209 174
## 7 145 203 140 214 177
g.n<-apply(tmpdf, 2, function(x) length(x))
g.n
## A B C D E
## 7 7 7 7 7
n=sum(g.n); n
## [1] 35
k=length(g.n); k
## [1] 5
#分組平均
g.mean<-apply(tmpdf, 2, function(x) mean(na.omit(x)))
g.mean
## A B C D E
## 99.57 135.29 94.00 139.29 111.14
mean<-sum(tmpdf)/n
print(mean)
## [1] 115.9
SSB=sum((g.mean - mean)^2*g.n)
as.numeric(SSB)
## [1] 11841
g.s<-apply(tmpdf, 2, function(x) sd(x))
g.s2<-g.s^2
SSW=sum(g.s2*(g.n-1))
as.numeric(SSW)
## [1] 100525
MSB=SSB/(length(g.n)-1)
as.numeric(MSB)
## [1] 2960
MSW=SSW/(sum(g.n)-length(g.n))
as.numeric(MSW)
## [1] 3351
Fstats=MSB/MSW; as.numeric(Fstats)
## [1] 0.8834
pf(Fstats, k-1, n-k)
## [1] 0.5143
library(UsingR)
oneway.test(weight ~ feed, data=chickwts, var.equal = F)
##
## One-way analysis of means (not assuming equal variances)
##
## data: weight and feed
## F = 20, num df = 5, denom df = 30, p-value = 1e-08
結果顯示,\(p<0.05\),所以結論為不同飼料餵的雞的重量有不同的平均數。
我們也可以用\(\tt{aov()}\)函式再做一遍,得到的結果一致。
不過,大部份時候\(\tt{oneway.test()}\)與\(\tt{aov()}\)這兩個函式得到的結果不同,所以在操作時要小心。建議直接使用\(\tt{aov()}\)進行變異數分析。
library(UsingR)
m1<-aov(Speed ~ Expt, data=morley)
summary(m1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Expt 1 72581 72581 13 0.00048 ***
## Residuals 98 545444 5566
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(UsingR)
m1<-aov(mpg ~ cyl, data=mtcars)
summary(m1)
## Df Sum Sq Mean Sq F value Pr(>F)
## cyl 1 818 818 79.6 6.1e-10 ***
## Residuals 30 308 10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
alpha=0.05;mse=10.3;m=3;k=3;n=32
tstar=qt(1-alpha/(2*m), n-k)
tstar
## [1] 2.541
mean.cyl<-mtcars %>% group_by(cyl) %>%
summarise(obs=n(), mmpg=mean(mpg))
mean.cyl4<-mean.cyl$mmpg[1]
mean.cyl6<-mean.cyl$mmpg[2]
mean.cyl8<-mean.cyl$mmpg[3]
obs.cyl4<-mean.cyl$obs[1]
obs.cyl6<-mean.cyl$obs[2]
obs.cyl8<-mean.cyl$obs[3]
#4, 6
cat("mu1_mu2=(",(mean.cyl4-mean.cyl6)-tstar*sqrt(mse)*
sqrt((1/obs.cyl4)+(1/obs.cyl6)),",",
(mean.cyl4-mean.cyl6)+tstar*sqrt(mse)*sqrt((1/obs.cyl4)+(1/obs.cyl6)),")","\n")
## mu1_mu2=( 2.978 , 10.86 )
#4,8
cat("mu1_mu3=(",(mean.cyl4-mean.cyl8)-tstar*sqrt(mse)*sqrt(1/obs.cyl4+1/obs.cyl8),",",
(mean.cyl4-mean.cyl8)+tstar*sqrt(mse)*sqrt(1/obs.cyl4+1/obs.cyl8),")","\n")
## mu1_mu3=( 8.278 , 14.85 )
#6,8
cat("mu2_mu3=(",(mean.cyl6-mean.cyl8)-tstar*sqrt(mse)*sqrt(1/obs.cyl6+1/obs.cyl8),",",
(mean.cyl6-mean.cyl8)+tstar*sqrt(mse)*sqrt(1/obs.cyl6+1/obs.cyl8),")")
## mu2_mu3=( 0.868 , 8.418 )
attach(mtcars)
pairwise.t.test(mpg, cyl, p.adj='bonf')
##
## Pairwise comparisons using t tests with pooled SD
##
## data: mpg and cyl
##
## 4 6
## 6 4e-04 -
## 8 3e-09 0.01
##
## P value adjustment method: bonferroni
R
另外提供Bartlett以及Levene兩種檢定變異數相等的函數。Bartlett檢驗的做法是:bartlett.test(mpg ~ cyl, data = mtcars)
##
## Bartlett test of homogeneity of variances
##
## data: mpg by cyl
## Bartlett's K-squared = 8.4, df = 2, p-value = 0.02
library(car)
leveneTest(mpg ~ as.factor(cyl), data = mtcars)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 5.51 0.0094 **
## 29
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
兩個檢驗結果都是拒斥變異數相等的假設。
雖然我們無法拒斥每一組平均數相等的假設,但是我們仍然可以用TukeyHSD函數檢驗哪些組別之間可能有差異:
A1<-aov(mpg ~ as.factor(cyl), data=mtcars)
TukeyHSD(A1)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = mpg ~ as.factor(cyl), data = mtcars)
##
## $`as.factor(cyl)`
## diff lwr upr p adj
## 6-4 -6.921 -10.769 -3.0722 0.0003
## 8-4 -11.564 -14.771 -8.3565 0.0000
## 8-6 -4.643 -8.328 -0.9581 0.0112
kidiq <- here::here('data','kidiq.dta')
kid <- sjlabelled::read_stata(kidiq)
head(kid)
## kid_score mom_hs mom_iq mom_work mom_age
## 1 65 1 121.12 4 27
## 2 98 1 89.36 4 25
## 3 85 1 115.44 4 27
## 4 83 1 99.45 3 25
## 5 115 1 92.75 4 27
## 6 98 0 107.90 1 18
A2 <- aov(kid_score ~ mom_hs + mom_work, data=kid)
summary(A2)
## Df Sum Sq Mean Sq F value Pr(>F)
## mom_hs 1 10125 10125 25.65 6.1e-07 ***
## mom_work 1 144 144 0.37 0.55
## Residuals 431 170117 395
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Respondent | Candidate.1 | Candidate.2 |
---|---|---|
1 | 75 | 66 |
2 | 80 | 74 |
3 | 86 | 74 |
4 | 70 | 74 |
5 | 80 | 89 |
6 | 69 | 79 |
7 | 59 | 83 |
8 | 86 | 85 |
9 | 74 | 80 |
10 | 80 | 80 |
11 | 88 | 60 |
12 | 88 | 59 |
13 | 76 | 49 |
14 | 90 | 42 |
15 | 91 | 30 |
16 | 70 | 72 |
17 | 68 | 60 |
18 | 55 | 88 |
may=c(2166,1568,2233,1882,2019)
sep=c(2279,2075,2131,2009,1793)
dec=c(2226,2154,2583,2010,2190)
dt<-cbind(may, sep, dec)
knitr::kable(dt, 'simple', caption='里程數資料')
may | sep | dec |
---|---|---|
2166 | 2279 | 2226 |
1568 | 2075 | 2154 |
2233 | 2131 | 2583 |
1882 | 2009 | 2010 |
2019 | 1793 | 2190 |
type1 <- c(303, 393, 396, 399, 398)
type2 <- c(322, 326, 315, 318, 320, 320)
type3 <- c(125, 351, 226, 119, 280)
wear <- list(type1=type1,type2=type2,type3=type3)
wear.st <- stack(wear)
seven <- rbind(metro=c(10,13,14,16,17,NA),
city=c(5,8,10,11,12,14),
country=c(6,8,9,13,NA,NA))
kable(seven)%>%
kable_styling(bootstrap_options = "striped", full_width = F)
metro | 10 | 13 | 14 | 16 | 17 | NA |
city | 5 | 8 | 10 | 11 | 12 | 14 |
country | 6 | 8 | 9 | 13 | NA | NA |
## 最後更新日期 05/14/2021