범주형 자료 분석

Chisq test

lung.cancer <-matrix(c(7,61,55,129,489,570,475,431,293,154,38,12), nrow=2)
dimnames(lung.cancer) <-list(group=c("case","control"), smoking =c("0","<5", "5-14","15-24","25-49","50+" ))

hiv.edu <-matrix(c(52,62,53,54,18,25,79,153,213,231,46,139,342,417,629,571,139,330,226,262,375,244,74,116), nrow=6)
dimnames(hiv.edu) <-list(education=c("고교중퇴","고졸","대학중퇴","대졸", "대학원중퇴","대학원졸"),참여도= c("절대안함","아마도 안함 ","아마도 함" , "반드시 함 " ))

chisq.test(lung.cancer) 
## 
##  Pearson's Chi-squared test
## 
## data:  lung.cancer
## X-squared = 137.7, df = 5, p-value < 2.2e-16
chisq.test(hiv.edu)
## 
##  Pearson's Chi-squared test
## 
## data:  hiv.edu
## X-squared = 89.72, df = 15, p-value = 1.117e-12

여러 모집단 비의 경향성에 관한 검정

예제: 임신부의 신발 사이즈와 제왕절개수술 여부와의 관련성에 관한 검정 (Altman, 1991, p.229)

library(ISwR)
caesar.shoe
##     <4  4 4.5  5 5.5  6+
## Yes  5  7   6  7   8  10
## No  17 28  36 41  46 140
caesar.shoe.yes <- caesar.shoe["Yes",]
caesar.shoe.total <- margin.table(caesar.shoe,2)
caesar.shoe.yes
##  <4   4 4.5   5 5.5  6+ 
##   5   7   6   7   8  10
caesar.shoe.total
##  <4   4 4.5   5 5.5  6+ 
##  22  35  42  48  54 150
  • Chi-squared test
prop.test(caesar.shoe.yes,caesar.shoe.total)
## Warning: Chi-squared approximation may be incorrect
## 
##  6-sample test for equality of proportions without continuity
##  correction
## 
## data:  caesar.shoe.yes out of caesar.shoe.total
## X-squared = 9.287, df = 5, p-value = 0.09814
## alternative hypothesis: two.sided
## sample estimates:
##  prop 1  prop 2  prop 3  prop 4  prop 5  prop 6 
## 0.22727 0.20000 0.14286 0.14583 0.14815 0.06667
  • Cochran -Armitage trend test
prop.trend.test(caesar.shoe.yes,caesar.shoe.total)
## 
##  Chi-squared Test for Trend in Proportions
## 
## data:  caesar.shoe.yes out of caesar.shoe.total ,
##  using scores: 1 2 3 4 5 6
## X-squared = 8.024, df = 1, p-value = 0.004617

Cohort study

library("epicalc")
## Loading required package: foreign
## Loading required package: survival
## Loading required package: splines
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:ISwR':
## 
##     lung
## 
## Loading required package: MASS
## Loading required package: nnet
csi(17, 122, 31, 109)
## 
##           Exposure
## Outcome    Non-exposed Exposed Total
##   Negative 109         122     231  
##   Positive 31          17      48   
##   Total    140         139     279  
##                                     
##            Rne         Re      Rt   
##   Risk     0.22        0.12    0.17 
##                                          Estimate Lower95ci Upper95ci
##  Risk difference (Re - Rne)              -0.1     -0.19     -0.01    
##  Risk ratio                              0.55     0.31      0.98     
##  Protective efficacy =(Rne-Re)/Rne*100   44.8     2.18      69.08    
##    or percent of risk reduced                                        
##  Number needed to treat (NNT)            10.09    5.34      97.16    
##    or -1/(risk difference)
cci(484,385,27,90) 

plot of chunk unnamed-chunk-5

## 
##           Exposure
## Outcome    Non-exposed Exposed Total
##   Negative          90     385   475
##   Positive          27     484   511
##   Total            117     869   986
## 
## OR =  4.19 
## Exact 95% CI =  2.63, 6.84  
## Chi-squared = 43.95, 1 d.f., P value = 0
## Fisher's exact test (2-sided) P value = 0
csi (104,391,66,340)
## 
##           Exposure
## Outcome    Non-exposed Exposed Total
##   Negative 340         391     731  
##   Positive 66          104     170  
##   Total    406         495     901  
##                                     
##            Rne         Re      Rt   
##   Risk     0.16        0.21    0.19 
## 
##                                          Estimate Lower95ci Upper95ci
##  Risk difference (attributable risk)     0.05     0         0.1      
##  Risk ratio                              1.29     0.97      1.72     
##  Attr. frac. exp. -- (Re-Rne)/Re         0.23                        
##  Attr. frac. pop. -- (Rt-Rne)/Rt*100 %   13.84                       
##  Number needed to harm (NNH)             21.04    -301.71   10.07    
##    or 1/(risk difference)

McNemar test

hivnet <-matrix(c(251,68,178,98),nrow=2, dimnames=list("baseline"=c("Incorrect","correct"),"6 month"=c("Incorrect","correct")))
mcnemar.test(hivnet)
## 
##  McNemar's Chi-squared test with continuity correction
## 
## data:  hivnet
## McNemar's chi-squared = 48.3, df = 1, p-value = 3.664e-12

Fisher’s exact test

TeaTasting <-
matrix(c(3, 1, 1, 3),
       nrow = 2,
       dimnames = list(Guess = c("Milk", "Tea"),
                       Truth = c("Milk", "Tea")))
fisher.test(TeaTasting, alternative = "greater")
## 
##  Fisher's Exact Test for Count Data
## 
## data:  TeaTasting
## p-value = 0.2429
## alternative hypothesis: true odds ratio is greater than 1
## 95 percent confidence interval:
##  0.3136    Inf
## sample estimates:
## odds ratio 
##      6.408

Simpson’s paradox

require(graphics)
## Data aggregated over departments
apply(UCBAdmissions, c(1, 2), sum)
##           Gender
## Admit      Male Female
##   Admitted 1198    557
##   Rejected 1493   1278
mosaicplot(apply(UCBAdmissions, c(1, 2), sum),
           main = "Student admissions at UC Berkeley")

plot of chunk unnamed-chunk-8

## Data for individual departments
opar <- par(mfrow = c(2, 3), oma = c(0, 0, 2, 0))
for(i in 1:6)
  mosaicplot(UCBAdmissions[,,i],
    xlab = "Admit", ylab = "Sex",
    main = paste("Department", LETTERS[i]))
mtext(expression(bold("Student admissions at UC Berkeley")),
      outer = TRUE, cex = 1.5)

plot of chunk unnamed-chunk-8

par(opar)

Stratified analysis

data(Oswego)
use(Oswego)
cc(ill, chocolate)

plot of chunk unnamed-chunk-9

## 
##        chocolate
## ill     FALSE TRUE Total
##   FALSE     7   22    29
##   TRUE     20   25    45
##   Total    27   47    74
## 
## OR =  0.4 
## Exact 95% CI =  0.12, 1.23  
## Chi-squared = 3.14, 1 d.f., P value = 0.076
## Fisher's exact test (2-sided) P value = 0.089
mhor(ill, chocolate, sex)
## 
## Stratified analysis by  sex 
##                 OR lower lim. upper lim. P value
## sex F        0.417     0.0617       2.06  0.3137
## sex M        0.331     0.0512       1.83  0.2635
## M-H combined 0.364     0.1258       1.05  0.0611
## 
## M-H Chi2(1) = 3.51 , P value = 0.061 
## Homogeneity test, chi-squared 1 d.f. = 0.05 , P value = 0.827

plot of chunk unnamed-chunk-9

Classification

require(ISLR)
## Loading required package: ISLR
names(Smarket)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
summary(Smarket)
##       Year           Lag1             Lag2             Lag3       
##  Min.   :2001   Min.   :-4.922   Min.   :-4.922   Min.   :-4.922  
##  1st Qu.:2002   1st Qu.:-0.640   1st Qu.:-0.640   1st Qu.:-0.640  
##  Median :2003   Median : 0.039   Median : 0.039   Median : 0.038  
##  Mean   :2003   Mean   : 0.004   Mean   : 0.004   Mean   : 0.002  
##  3rd Qu.:2004   3rd Qu.: 0.597   3rd Qu.: 0.597   3rd Qu.: 0.597  
##  Max.   :2005   Max.   : 5.733   Max.   : 5.733   Max.   : 5.733  
##       Lag4             Lag5            Volume          Today       
##  Min.   :-4.922   Min.   :-4.922   Min.   :0.356   Min.   :-4.922  
##  1st Qu.:-0.640   1st Qu.:-0.640   1st Qu.:1.257   1st Qu.:-0.640  
##  Median : 0.038   Median : 0.038   Median :1.423   Median : 0.038  
##  Mean   : 0.002   Mean   : 0.006   Mean   :1.478   Mean   : 0.003  
##  3rd Qu.: 0.597   3rd Qu.: 0.597   3rd Qu.:1.642   3rd Qu.: 0.597  
##  Max.   : 5.733   Max.   : 5.733   Max.   :3.152   Max.   : 5.733  
##  Direction 
##  Down:602  
##  Up  :648  
##            
##            
##            
## 
pairs(Smarket,col=Smarket$Direction)

plot of chunk unnamed-chunk-10

Logistic regression

glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
            data=Smarket,family=binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Smarket)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -1.45   -1.20    1.07    1.15    1.33  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.12600    0.24074   -0.52     0.60
## Lag1        -0.07307    0.05017   -1.46     0.15
## Lag2        -0.04230    0.05009   -0.84     0.40
## Lag3         0.01109    0.04994    0.22     0.82
## Lag4         0.00936    0.04997    0.19     0.85
## Lag5         0.01031    0.04951    0.21     0.83
## Volume       0.13544    0.15836    0.86     0.39
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1742
## 
## Number of Fisher Scoring iterations: 3
glm.probs=predict(glm.fit,type="response") 
glm.probs[1:5]
##      1      2      3      4      5 
## 0.5071 0.4815 0.4811 0.5152 0.5108
glm.pred=ifelse(glm.probs>0.5,"Up","Down")
attach(Smarket)
table(glm.pred,Direction)
##         Direction
## glm.pred Down  Up
##     Down  145 141
##     Up    457 507
mean(glm.pred==Direction)
## [1] 0.5216

Make training and test set

train = Year<2005
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
            data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response") 
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
Direction.2005=Smarket$Direction[!train]
table(glm.pred,Direction.2005)
##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44
mean(glm.pred==Direction.2005)
## [1] 0.4802

Fit smaller model

glm.fit=glm(Direction~Lag1+Lag2,
            data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response") 
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
table(glm.pred,Direction.2005)
##         Direction.2005
## glm.pred Down  Up
##     Down   35  35
##     Up     76 106
mean(glm.pred==Direction.2005)
## [1] 0.5595
106/(76+106)
## [1] 0.5824