Categorical Data Analysis notes

##Week 1

Significance test about a binomial proportion Example: 2016 General Social Survey

prop.test(x=837,n=1810, p=0.5, alternative ="two.sided" ,correct = FALSE) #"less", "greater

## 
##  1-sample proportions test without continuity correction
## 
## data:  837 out of 1810, null probability 0.5
## X-squared = 10.219, df = 1, p-value = 0.00139
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.4395653 0.4854557
## sample estimates:
##         p 
## 0.4624309

#we could add "conf.level = 0.95"

#confidence interval

Example, for n=10 and y=9, construct a 95% confidence interval using * Wilson’s method * The traditional methods

library(PropCIs)
scoreci(x=9, n=10, conf.level = 0.95) #Wilson's confidence interval

## 
## 
## 
## data:  
## 
## 95 percent confidence interval:
##  0.5958 0.9821

Agresti-Coull

##Week 2

#Small sample binomial inference Video 2 Suppose Ho: pi=0.5 vs Ha: pi>0.5 y=9, n=10

library(PropCIs)
library(stats)
#It can be expressed as the sum of those two terms
dbinom(x=9,size=10,prob=0.5)+ dbinom(10,10,0.5)

## [1] 0.01074219

dbinom(x=0,size=30,prob=.30)+ dbinom(x=1,size=30,prob=.30)

## [1] 0.0003123309

dnorm(9,6.3,)

## [1] 0.01042093

#Small-sample discrete Inference is conservative Ho:pi=0.5 vs Ha: pi is not equal to 0.5 n=25

This is a two sided hypothesis so we need a two sided p-value

#If we observe y=7
2*pbinom(7,25,0.5)

## [1] 0.04328525

#If we observe y=8
2*pbinom(8,25,0.5)

## [1] 0.1077521

Properties fo Mid P-values (slide 39)

#py represents the pmf
#mpval is the mid p-value

#sum(py*mpval) #By doing this calculation, we are calculating the expectation of the mid p-value and it's equal to 0.5

Video 3 The posterior 95% equal-tail credible interval alpha* = y + alpha= 837.5 beta* = n-y+beta = 973.5

library(stats)

round(qbeta(0.25, 837.5, 973.5), 3)#0.25 quartile gives the upper limit

## [1] 0.455

round(qbeta(0.975, 837.5, 973.5), 3)#0.75 gives the lower limit

## [1] 0.485

##Week 3
Video 2 Significance test for comparing two proportions

Example: Success = having heart attack p1=189/11034 (having heart attack in the placebo group) p2=104/11037 (having heart attack in the aspirin group)

prop.test(c(189,104), c(11034,11037), alternative = "two.sided", correct=FALSE)

## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  c(189, 104) out of c(11034, 11037)
## X-squared = 25.014, df = 1, p-value = 5.692e-07
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.004687751 0.010724297
## sample estimates:
##     prop 1     prop 2 
## 0.01712887 0.00942285

Using another package.It uses binomial success counts and n for each group

library(PropCIs)

diffscoreci(189,11034, 104, 11037, conf.level = 0.95)

## 
## 
## 
## data:  
## 
## 95 percent confidence interval:
##  0.004716821 0.010788501

Video3 Relative risk

library(PropCIs)
riskscoreci(189,11034, 104, 11037,0.95)

## 
## 
## 
## data:  
## 
## 95 percent confidence interval:
##  1.433904 2.304713

#The risk of MI is at least 43% higher for the placebo group by looking at the lower limit f the CI

library(epitools)
oddsratio(c(189,11034, 104, 11037), method = "wald",
          conf=0.95,correct=FALSE)

## $data
##           Outcome
## Predictor  Disease1 Disease2 Total
##   Exposed1      189    11034 11223
##   Exposed2      104    11037 11141
##   Total         293    22071 22364
## 
## $measure
##           odds ratio with 95% C.I.
## Predictor  estimate    lower    upper
##   Exposed1 1.000000       NA       NA
##   Exposed2 1.817802 1.428868 2.312603
## 
## $p.value
##           two-sided
## Predictor    midp.exact fisher.exact   chi.square
##   Exposed1           NA           NA           NA
##   Exposed2 7.003039e-07 8.978571e-07 7.996161e-07
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"

# a95% CI is just the exp of the lower limit and exp of the upper limit
#this confidence interval doesn't contain 1, so the true odds of MI are different for the two groups
#The odds of MI for the placebo group are at least 44% higher for the aspirin group

For small samples, we use the score confidence interval

library(PropCIs)
orscoreci(189, 11034,104,11037, 0.95)

## 
## 
## 
## data:  
## 
## 95 percent confidence interval:
##  1.440802 2.329551

##Week 4 *Video 1

Example: Gender gap inpoliical affiliation

Political<-read.table("http://www.stat.ufl.edu/~aa/cat/data/Political.dat", header = TRUE)
head(Political)

##   person gender party
## 1      1 female   Dem
## 2      2 female   Dem
## 3      3 female   Dem
## 4      4 female   Dem
## 5      5 female   Dem
## 6      6 female   Dem

#Specify preferred order
Party<-factor(Political$party,levels = c("Dem", "Rep", "Ind"))
#categories for displays
GenderGap<-xtabs(~gender+Party, data = Political)
#forms contingency table
GenderGap

##         Party
## gender   Dem Rep Ind
##   female 495 272 590
##   male   330 265 498

margin.table(GenderGap,1) #gender

## gender
## female   male 
##   1357   1093

margin.table(GenderGap,2) #party affiliation

## Party
##  Dem  Rep  Ind 
##  825  537 1088

summary(GenderGap) #this summary would give us the test of independence

## Call: xtabs(formula = ~gender + Party, data = Political)
## Number of cases in table: 2450 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 12.569, df = 2, p-value = 0.001865

Another way of doing the same example If we just have he total counts

F<-c(495,272,590)
M<-c(330,265,498)
gender.gap<-as.data.frame(rbind(F,M))
names(gender.gap)<-c("Dem", "Rep", "Ind")
gender.gap

##   Dem Rep Ind
## F 495 272 590
## M 330 265 498

fit<-chisq.test(gender.gap); fit

## 
##  Pearson's Chi-squared test
## 
## data:  gender.gap
## X-squared = 12.569, df = 2, p-value = 0.001865

round(fit$expected,1) #expected values for each cell

##     Dem   Rep   Ind
## F 456.9 297.4 602.6
## M 368.1 239.6 485.4

round(fit$stdres,2) #Standardized residuals

##     Dem  Rep   Ind
## F  3.27 -2.5 -1.03
## M -3.27  2.5  1.03

Video 3

#2x3 table  summing Dem+Rep
gender.gap1<-gender.gap[,-c(3)]
gender.gap1

##   Dem Rep
## F 495 272
## M 330 265

'Dem+Rep'<-gender.gap$Dem + gender.gap$Rep
`Dem+Rep`

## [1] 767 595

gender.gap2<-as.data.frame(cbind(`Dem+Rep`,gender.gap$Ind)) #Combine by column
names(gender.gap2)<-c("Rep+Dem","Ind")
rownames(gender.gap2)<-c("F", "M")
gender.gap2

##   Rep+Dem Ind
## F     767 590
## M     595 498

library(DescTools)

GTest(gender.gap) #Full table

## 
##  Log likelihood ratio (G-test) test of independence without correction
## 
## data:  gender.gap
## G = 12.601, X-squared df = 2, p-value = 0.001835

#There is strong evidence to say there is difference b/t females and males 

GTest(gender.gap1) #table with republicans and democrats alone

## 
##  Log likelihood ratio (G-test) test of independence without correction
## 
## data:  gender.gap1
## G = 11.536, X-squared df = 1, p-value = 0.0006827

#G^2= 11.536

GTest(gender.gap2) #table with columns Dem+Rep, Ind

## 
##  Log likelihood ratio (G-test) test of independence without correction
## 
## data:  gender.gap2
## G = 1.0652, X-squared df = 1, p-value = 0.302

Example: Alcohol use and Infant Malformation Explanatory: alcohol consumption (ordinal)-ranges/reponse levels are a bit subjective Response: Malformation (nominal)

c1<-c(17066,48)
c2<-c(14464, 38)
c3<-c(788, 5 )
c4<-c(126, 1)
c5<-c(37, 1)
alc<-as.data.frame(rbind(c1, c2,c3,c4,c5))
names(alc)<-c("Absent", "Present") 
alc

##    Absent Present
## c1  17066      48
## c2  14464      38
## c3    788       5
## c4    126       1
## c5     37       1

col.sums<-colSums(alc)
row.sums<-rowSums(alc)
tot<-sum(col.sums)
row.probs<-row.sums/tot
col.probs<-col.sums/tot
                   
round(row.probs,3)

##    c1    c2    c3    c4    c5 
## 0.525 0.445 0.024 0.004 0.001

round(col.probs,3)

##  Absent Present 
##   0.997   0.003

u<-c(0, 0.5, 1.5, 4,7)
v<-c(0,1)

u.bar<-sum(row.probs*u)
v.bar<-sum(col.probs*v)

probs<-alc/tot

den<-sqrt(sum((u-u.bar)^2)*row.probs)*sum((v-v.bar)^2*col.probs)

num<-0
for (i in 1:5){
  for (j in 1:2){
    num<-num+(u[i]-u.bar)*(v[j]-v.bar)*probs[i,j]
  }
}
                   
num/den

##         c1         c2         c3         c4         c5 
## 0.02139913 0.02324652 0.09941118 0.24841052 0.45412984

round(1-pchisq(6.57,1),3) #strong evidence of a nonzero correlation

## [1] 0.01

We can do the sameby using pakage

library(vcdExtra)

## Loading required package: vcd

## Loading required package: grid

## 
## Attaching package: 'vcd'

## The following object is masked from 'package:epitools':
## 
##     oddsratio

## Loading required package: gnm

## 
## Attaching package: 'vcdExtra'

## The following object is masked from 'package:epitools':
## 
##     expand.table

Malform<-matrix(c(17066, 14464,788,126,37,48,38,5,1,1), ncol = 2)
CMHtest(Malform, rscores = c(0,0.5,1.5,4,7))

## Cochran-Mantel-Haenszel Statistics 
## 
##                  AltHypothesis   Chisq Df     Prob
## cor        Nonzero correlation  6.5699  1 0.010372
## rmeans  Row mean scores differ 12.0817  4 0.016754
## cmeans  Col mean scores differ  6.5699  1 0.010372
## general    General association 12.0817  4 0.016754

#Week 5 Video 1 Showed Fisher test ouput for the tea example but did not

Video 2 Tea example

library(epitools)
ormidp.test(3,1,1,3, or=1) #enter the four cell counts

##   one.sided two.sided
## 1 0.1285714 0.2571429

#We see the observed p-values for one and two-sided

library(exact2x2)

## Loading required package: exactci

## Loading required package: ssanv

## Loading required package: testthat

## 
## Attaching package: 'exactci'

## The following object is masked from 'package:epitools':
## 
##     binom.exact

#taster
#fisher.exact(taster)

#this is a useless confidence interval because it is too wide

library(PropCIs)
#posteriorintervval for odds ratio
orci.bayes(11,11,0,1,0.5, 0.5, 0.5, 0.5,0.95, nsim = 1e+06)

## [1] 3.276438e+00 1.361274e+06

#confidence interval for thedifferencebetween theproportions 
diffci.bayes(11,11,0,1,0.5, 0.5, 0.5, 0.5,0.95, nsim = 1e+06)

## [1] 0.09899729 0.99327276

library(stats)
pi1=rbeta(1e+8, 11.5, 0.5)
pi2=rbeta(1e+8, 0.5, 1.5)
or<-pi1*(1-pi2)/((1-pi1)*pi2) #Find odds ratio
quantile(or, c(0.25, 0.975))

##          25%        97.5% 
## 5.805198e+01 1.361927e+06

mean(pi1<pi2)

## [1] 0.0058584

dbinom(x=8,size=10,prob=0.5)+dbinom(x=9,size=10,prob=0.5)+ dbinom(10,10,0.5)

## [1] 0.0546875

x=(0.5*(dbinom(x=8,size=10,prob=0.5)))+ (1-(dbinom(x=8,size=10,prob=0.5)+dbinom(x=9,size=10,prob=0.5)+ dbinom(10,10,0.5)))
y=(0.5*(dbinom(x=8,size=10,prob=0.5)))+ ((dbinom(x=8,size=10,prob=0.5)+dbinom(x=9,size=10,prob=0.5)+ dbinom(10,10,0.5)))
y

## [1] 0.07666016

x+y

## [1] 1.043945

dbinom(x=9,size=10,prob=0.5)+ dbinom(10,10,0.5)

## [1] 0.01074219

Week 2, video 2

Week 3, video 3. ## ODDS RATIO large samples

library(epitools)
dat<-matrix(c(189, 10845, 104, 10933),2,2, byrow = TRUE)
dat

##      [,1]  [,2]
## [1,]  189 10845
## [2,]  104 10933

#odds
#rev = c("neither", "rows", "columns", "both")
#oddsratio(x=, y=, method = , conf.level=0.95, correction = FALSE)
#oddsratio(dat, method = "wald", conf.level=0.95, correction=FALSE)

#Interpretation
#This CI doesn't contain 1, thus the true odds of MI are different for the two groups
#The odds of MI for the placebo group are at least 44% HIGHER than for Aspiring group

ODDS RATIO small samples

#proportion 1 & proportion 2 are close to 0 or 1.
orscoreci(189, 11034, 104, 11037, 0.95)

## 
## 
## 
## data:  
## 
## 95 percent confidence interval:
##  1.440802 2.329551

Week 4, video 1 ##Example Gendergap in political affiliation

Political<-read.table("http://www.stat.ufl.edu/~aa/cat/data/Political.dat", header=TRUE)
head(Political)

##   person gender party
## 1      1 female   Dem
## 2      2 female   Dem
## 3      3 female   Dem
## 4      4 female   Dem
## 5      5 female   Dem
## 6      6 female   Dem

Party<-factor(Political$party, levels=c("Dem", "Rep", "Ind")) #level specifies preferred order of categories for displays

GenderGap<-xtabs(~gender + Party, data=Political)
#forms of contengency table
GenderGap

##         Party
## gender   Dem Rep Ind
##   female 495 272 590
##   male   330 265 498

margin.table(GenderGap,1)

## gender
## female   male 
##   1357   1093

margin.table(GenderGap,2)

## Party
##  Dem  Rep  Ind 
##  825  537 1088

summary(GenderGap)

## Call: xtabs(formula = ~gender + Party, data = Political)
## Number of cases in table: 2450 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 12.569, df = 2, p-value = 0.001865

Another way of creating contengency tables

F<-c(495, 272, 590)
M<-c(330, 265, 498)
gender.gap<-as.data.frame(rbind(F,M))
gender.gap

##    V1  V2  V3
## F 495 272 590
## M 330 265 498

names(gender.gap)<-c("Dem", "Rep", "Ind")
fit<-chisq.test(gender.gap)
fit

## 
##  Pearson's Chi-squared test
## 
## data:  gender.gap
## X-squared = 12.569, df = 2, p-value = 0.001865

stdres <- chisq.test(GenderGap)$stdres # standardized residuals
stdres

##         Party
## gender         Dem       Rep       Ind
##   female  3.272365 -2.498557 -1.032199
##   male   -3.272365  2.498557  1.032199

library(vcd)
#mosaic(diag, gp=shading_Friendly, residuals=stdres, # mosaic plot
#+ residuals_type="Std\nresiduals", labeling=labeling_residuals)

Standardized residuals

round(fit$stdres,2)

##     Dem  Rep   Ind
## F  3.27 -2.5 -1.03
## M -3.27  2.5  1.03

#Week 4, video 2

library(DescTools)
GTest(gender.gap1)

## 
##  Log likelihood ratio (G-test) test of independence without correction
## 
## data:  gender.gap1
## G = 11.536, X-squared df = 1, p-value = 0.0006827

GTest(gender.gap2)

## 
##  Log likelihood ratio (G-test) test of independence without correction
## 
## data:  gender.gap2
## G = 1.0652, X-squared df = 1, p-value = 0.302

#week 5,video,5

#cases<-data.frame(victim=factor(c(rep("W",515),rep("B",159)),levels = c("W", "B")), Defendant=factor(c(rep("W",515),rep("B",159)),levels = c("W", "B"))

#ctable<-xtabs(~victim+Defendant+y,data = cases)

Categorical Data Analysis notes

Maria Aleida Ramirez

2/8/2022

ODDS RATIO small samples