lung.cancer <- matrix(c(7, 61, 55, 129, 489, 570, 475, 431, 293, 154, 38, 12), nrow = 2)
dimnames(lung.cancer) <- list(group = c("case", "control"),
                              smoking = c("0", "<5", "5-14", "15-24", "25-49", "50+"))
hiv.edu <- matrix(c(52, 62, 53, 54, 18, 25, 79, 153, 213, 231, 46, 139, 342, 417, 629, 571, 139, 330, 226, 262, 375, 244, 74, 116), nrow = 6)
dimnames(hiv.edu) <- list(education = c("HS dropout", "HS graduate", "College dropout", "College graduate", "Grad school dropout", "Grad school graduate"),
                          participation = c("Definitely not", "Probably not", "Probably would", "Definitely would"))
chisq.test(lung.cancer)
##
## Pearson's Chi-squared test
##
## data: lung.cancer
## X-squared = 137.7, df = 5, p-value < 2.2e-16
chisq.test(hiv.edu)
##
## Pearson's Chi-squared test
##
## data: hiv.edu
## X-squared = 89.72, df = 15, p-value = 1.117e-12
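To see which cells drive the association, the expected counts and standardized residuals stored in the chisq.test result can be inspected; a minimal sketch (the object name chisq.hiv is just illustrative):

chisq.hiv <- chisq.test(hiv.edu)
round(chisq.hiv$expected, 1)   # expected counts under independence
round(chisq.hiv$stdres, 2)     # standardized residuals; |value| > 2 flags influential cells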
Test for a trend in proportions across several groups
Example: testing the association between maternal shoe size and Caesarean section (Altman, 1991, p.229)
library(ISwR)
caesar.shoe
## <4 4 4.5 5 5.5 6+
## Yes 5 7 6 7 8 10
## No 17 28 36 41 46 140
caesar.shoe.yes <- caesar.shoe["Yes",]
caesar.shoe.total <- margin.table(caesar.shoe,2)
caesar.shoe.yes
## <4 4 4.5 5 5.5 6+
## 5 7 6 7 8 10
caesar.shoe.total
## <4 4 4.5 5 5.5 6+
## 22 35 42 48 54 150
prop.test(caesar.shoe.yes,caesar.shoe.total)
## Warning: Chi-squared approximation may be incorrect
##
## 6-sample test for equality of proportions without continuity
## correction
##
## data: caesar.shoe.yes out of caesar.shoe.total
## X-squared = 9.287, df = 5, p-value = 0.09814
## alternative hypothesis: two.sided
## sample estimates:
## prop 1 prop 2 prop 3 prop 4 prop 5 prop 6
## 0.22727 0.20000 0.14286 0.14583 0.14815 0.06667
prop.trend.test(caesar.shoe.yes,caesar.shoe.total)
##
## Chi-squared Test for Trend in Proportions
##
## data: caesar.shoe.yes out of caesar.shoe.total ,
## using scores: 1 2 3 4 5 6
## X-squared = 8.024, df = 1, p-value = 0.004617
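By default prop.trend.test uses the equally spaced scores 1 through 6 shown above. If numeric scores for the shoe-size categories are preferred, they can be supplied through the score argument; the midpoint-style values below are purely illustrative.

shoe.scores <- c(3.75, 4, 4.5, 5, 5.5, 6.5)   # illustrative scores for "<4" ... "6+"
prop.trend.test(caesar.shoe.yes, caesar.shoe.total, score = shoe.scores)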
library("epicalc")
## Loading required package: foreign
## Loading required package: survival
## Loading required package: splines
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:ISwR':
##
## lung
##
## Loading required package: MASS
## Loading required package: nnet
csi(17, 122, 31, 109)
##
## Exposure
## Outcome Non-exposed Exposed Total
## Negative 109 122 231
## Positive 31 17 48
## Total 140 139 279
##
## Rne Re Rt
## Risk 0.22 0.12 0.17
## Estimate Lower95ci Upper95ci
## Risk difference (Re - Rne) -0.1 -0.19 -0.01
## Risk ratio 0.55 0.31 0.98
## Protective efficacy =(Rne-Re)/Rne*100 44.8 2.18 69.08
## or percent of risk reduced
## Number needed to treat (NNT) 10.09 5.34 97.16
## or -1/(risk difference)
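The quantities reported by csi() can be reproduced in base R from the four counts in the printed table; a minimal sketch:

Re  <- 17 / (17 + 122)     # risk in the exposed, ~0.12
Rne <- 31 / (31 + 109)     # risk in the non-exposed, ~0.22
Re - Rne                   # risk difference, ~ -0.10
Re / Rne                   # risk ratio, ~0.55
(Rne - Re) / Rne * 100     # protective efficacy, ~44.8
-1 / (Re - Rne)            # number needed to treat, ~10.1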
cci(484,385,27,90)
##
## Exposure
## Outcome Non-exposed Exposed Total
## Negative 90 385 475
## Positive 27 484 511
## Total 117 869 986
##
## OR = 4.19
## Exact 95% CI = 2.63, 6.84
## Chi-squared = 43.95, 1 d.f., P value = 0
## Fisher's exact test (2-sided) P value = 0
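The odds ratio printed by cci() is the cross-product ratio of the 2 x 2 table; it can be cross-checked in base R, keeping in mind that fisher.test() reports a conditional maximum-likelihood odds ratio, which can differ slightly from the cross-product estimate.

(484 * 90) / (385 * 27)    # cross-product odds ratio, ~4.19
tab <- matrix(c(484, 385, 27, 90), nrow = 2,
              dimnames = list(Outcome = c("Positive", "Negative"),
                              Exposure = c("Exposed", "Non-exposed")))
fisher.test(tab)           # exact p-value and exact 95% CI for the OR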
csi(104, 391, 66, 340)
##
## Exposure
## Outcome Non-exposed Exposed Total
## Negative 340 391 731
## Positive 66 104 170
## Total 406 495 901
##
## Rne Re Rt
## Risk 0.16 0.21 0.19
##
## Estimate Lower95ci Upper95ci
## Risk difference (attributable risk) 0.05 0 0.1
## Risk ratio 1.29 0.97 1.72
## Attr. frac. exp. -- (Re-Rne)/Re 0.23
## Attr. frac. pop. -- (Rt-Rne)/Rt*100 % 13.84
## Number needed to harm (NNH) 21.04 -301.71 10.07
## or 1/(risk difference)
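The attributable fractions and NNH in this output can likewise be checked by hand from the printed table:

Re  <- 104 / (104 + 391)   # risk in the exposed, ~0.21
Rne <- 66 / (66 + 340)     # risk in the non-exposed, ~0.16
Rt  <- (104 + 66) / 901    # overall risk, ~0.19
Re - Rne                   # risk difference (attributable risk), ~0.05
(Re - Rne) / Re            # attributable fraction among the exposed, ~0.23
(Rt - Rne) / Rt * 100      # attributable fraction in the population, ~13.8%
1 / (Re - Rne)             # number needed to harm, ~21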
McNemar's test
hivnet <- matrix(c(251, 68, 178, 98), nrow = 2,
                 dimnames = list(baseline = c("Incorrect", "correct"),
                                 "6 month" = c("Incorrect", "correct")))
mcnemar.test(hivnet)
##
## McNemar's Chi-squared test with continuity correction
##
## data: hivnet
## McNemar's chi-squared = 48.3, df = 1, p-value = 3.664e-12
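McNemar's statistic depends only on the discordant cells; with the continuity correction it is (|n12 - n21| - 1)^2 / (n12 + n21), which reproduces the value above:

n12 <- hivnet["Incorrect", "correct"]    # 178: incorrect at baseline, correct at 6 months
n21 <- hivnet["correct", "Incorrect"]    # 68: correct at baseline, incorrect at 6 months
(abs(n12 - n21) - 1)^2 / (n12 + n21)     # ~48.3, matching the test statistic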
Fisher’s exact test
TeaTasting <-
matrix(c(3, 1, 1, 3),
nrow = 2,
dimnames = list(Guess = c("Milk", "Tea"),
Truth = c("Milk", "Tea")))
fisher.test(TeaTasting, alternative = "greater")
##
## Fisher's Exact Test for Count Data
##
## data: TeaTasting
## p-value = 0.2429
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
## 0.3136 Inf
## sample estimates:
## odds ratio
## 6.408
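Because both margins are fixed at 4, the one-sided p-value is a hypergeometric tail probability, so the 0.2429 above can be reproduced directly:

sum(dhyper(3:4, m = 4, n = 4, k = 4))   # P(3 or more correct "Milk" guesses) = 17/70 ~ 0.2429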
Simpson’s paradox
require(graphics)
## Data aggregated over departments
apply(UCBAdmissions, c(1, 2), sum)
## Gender
## Admit Male Female
## Admitted 1198 557
## Rejected 1493 1278
mosaicplot(apply(UCBAdmissions, c(1, 2), sum),
main = "Student admissions at UC Berkeley")
## Data for individual departments
opar <- par(mfrow = c(2, 3), oma = c(0, 0, 2, 0))
for(i in 1:6)
mosaicplot(UCBAdmissions[,,i],
xlab = "Admit", ylab = "Sex",
main = paste("Department", LETTERS[i]))
mtext(expression(bold("Student admissions at UC Berkeley")),
outer = TRUE, cex = 1.5)
par(opar)
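The paradox is also visible numerically: aggregated over departments men are admitted at a higher rate, yet within most individual departments the female admission rate is as high or higher. A minimal sketch:

prop.table(apply(UCBAdmissions, c(1, 2), sum), margin = 2)              # overall admission rate by gender
round(prop.table(UCBAdmissions, margin = c(2, 3))["Admitted", , ], 2)   # rate by gender within each department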
Stratified analysis
data(Oswego)
use(Oswego)
cc(ill, chocolate)
##
## chocolate
## ill FALSE TRUE Total
## FALSE 7 22 29
## TRUE 20 25 45
## Total 27 47 74
##
## OR = 0.4
## Exact 95% CI = 0.12, 1.23
## Chi-squared = 3.14, 1 d.f., P value = 0.076
## Fisher's exact test (2-sided) P value = 0.089
mhor(ill, chocolate, sex)
##
## Stratified analysis by sex
## OR lower lim. upper lim. P value
## sex F 0.417 0.0617 2.06 0.3137
## sex M 0.331 0.0512 1.83 0.2635
## M-H combined 0.364 0.1258 1.05 0.0611
##
## M-H Chi2(1) = 3.51 , P value = 0.061
## Homogeneity test, chi-squared 1 d.f. = 0.05 , P value = 0.827
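The Mantel-Haenszel combined OR can be cross-checked with base R's mantelhaen.test(); the strata must form the last dimension of the table, and the orientation of the estimate depends on the factor ordering, so this is only a sketch:

mantelhaen.test(table(ill, chocolate, sex))   # should agree with the M-H combined OR above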
require(ISLR)
## Loading required package: ISLR
names(Smarket)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
summary(Smarket)
## Year Lag1 Lag2 Lag3
## Min. :2001 Min. :-4.922 Min. :-4.922 Min. :-4.922
## 1st Qu.:2002 1st Qu.:-0.640 1st Qu.:-0.640 1st Qu.:-0.640
## Median :2003 Median : 0.039 Median : 0.039 Median : 0.038
## Mean :2003 Mean : 0.004 Mean : 0.004 Mean : 0.002
## 3rd Qu.:2004 3rd Qu.: 0.597 3rd Qu.: 0.597 3rd Qu.: 0.597
## Max. :2005 Max. : 5.733 Max. : 5.733 Max. : 5.733
## Lag4 Lag5 Volume Today
## Min. :-4.922 Min. :-4.922 Min. :0.356 Min. :-4.922
## 1st Qu.:-0.640 1st Qu.:-0.640 1st Qu.:1.257 1st Qu.:-0.640
## Median : 0.038 Median : 0.038 Median :1.423 Median : 0.038
## Mean : 0.002 Mean : 0.006 Mean :1.478 Mean : 0.003
## 3rd Qu.: 0.597 3rd Qu.: 0.597 3rd Qu.:1.642 3rd Qu.: 0.597
## Max. : 5.733 Max. : 5.733 Max. :3.152 Max. : 5.733
## Direction
## Down:602
## Up :648
##
##
##
##
pairs(Smarket,col=Smarket$Direction)
Logistic regression
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data=Smarket,family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Smarket)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.45 -1.20 1.07 1.15 1.33
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.12600 0.24074 -0.52 0.60
## Lag1 -0.07307 0.05017 -1.46 0.15
## Lag2 -0.04230 0.05009 -0.84 0.40
## Lag3 0.01109 0.04994 0.22 0.82
## Lag4 0.00936 0.04997 0.19 0.85
## Lag5 0.01031 0.04951 0.21 0.83
## Volume 0.13544 0.15836 0.86 0.39
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1742
##
## Number of Fisher Scoring iterations: 3
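The coefficients are on the log-odds scale, so exponentiating gives odds ratios; confint() on a glm object gives profile-likelihood intervals (MASS's method, already loaded above). A minimal sketch:

round(exp(cbind(OR = coef(glm.fit), confint(glm.fit))), 3)   # odds ratios with 95% profile CIs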
glm.probs=predict(glm.fit,type="response")
glm.probs[1:5]
## 1 2 3 4 5
## 0.5071 0.4815 0.4811 0.5152 0.5108
glm.pred=ifelse(glm.probs>0.5,"Up","Down")
attach(Smarket)
table(glm.pred,Direction)
## Direction
## glm.pred Down Up
## Down 145 141
## Up 457 507
mean(glm.pred==Direction)
## [1] 0.5216
Make training and test sets
train = Year<2005
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response")
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
Direction.2005=Smarket$Direction[!train]
table(glm.pred,Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 77 97
## Up 34 44
mean(glm.pred==Direction.2005)
## [1] 0.4802
Fit a smaller model
glm.fit=glm(Direction~Lag1+Lag2,
data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response")
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
table(glm.pred,Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 35 35
## Up 76 106
mean(glm.pred==Direction.2005)
## [1] 0.5595
106/(76+106)
## [1] 0.5824
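That last ratio is the proportion of correct calls among days predicted "Up". Other summaries can be read off the same test-set confusion matrix; a sketch treating "Up" as the positive class (the object name conf is illustrative):

conf <- table(glm.pred, Direction.2005)
sum(diag(conf)) / sum(conf)                 # overall test accuracy, ~0.56
conf["Up", "Up"] / sum(conf[, "Up"])        # sensitivity for "Up", 106/141
conf["Down", "Down"] / sum(conf[, "Down"])  # specificity, 35/111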