## Chi-Square Test for Independence
# How to conduct a chi-square test for independence. The test
# is applied when there are two categorical variables from a
# single population. It is used to determine whether there is
# a significant association between the two variables.
require(foreign)
## Loading required package: foreign
setwd("~/Desktop/R/GSS")
options(digets = 8)
GSData.df = read.dta("GSS7214_R4.DTA") # From my wd downloaded as (GSS7214_R4.DTA)
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
save(GSData.df, file = "GSS2014.rda") # Save the data frame to save the read
# from now on
#load("GSS2014.rda")
GSS.sub = GSData.df[, c('sex', 'educ', 'cappun')] # Select columns of interest
str(GSS.sub) # inspect the data frame
## 'data.frame': 59599 obs. of 3 variables:
## $ sex : Factor w/ 2 levels "male","female": 2 1 2 2 2 1 1 1 2 2 ...
## $ educ : int 16 10 12 17 12 14 13 16 12 12 ...
## $ cappun: Factor w/ 5 levels "iap","favor",..: NA NA NA NA NA NA NA NA NA NA ...
## add a column which converts the education years to different factors
GSS.sub$educ.yrs[GSS.sub$educ > 16] = 'Grad'
GSS.sub$educ.yrs[GSS.sub$educ == 16] = 'Bachelors'
GSS.sub$educ.yrs[GSS.sub$educ < 16
& GSS.sub$educ > 12] = 'SomeCollege'
GSS.sub$educ.yrs[GSS.sub$educ == 12] = 'HS'
GSS.sub$educ.yrs[GSS.sub$educ < 12] = 'LeftHS'
newGSS = na.omit(GSS.sub) # remove observations containing NA
newGSS$cappun = droplevels(newGSS$cappun) # drop unused levels
newGSS$educ.yrs = ordered(newGSS$educ.yrs, # order the factors
levels = c("LeftHS", "HS", "SomeCollege", "Bachelors", "Grad"))
## Create a contengency two-way table showing how different education
## levels favor or oppose capital punishment
tbl = with(newGSS, table(cappun, educ.yrs))
tbl
## educ.yrs
## cappun LeftHS HS SomeCollege Bachelors Grad
## favor 7471 11516 8495 4273 3097
## oppose 3450 3477 3074 1862 1836
## Chi-square Test of Independence
x = chisq.test(tbl) # A low p-value (p < .01) suggests we sludent use
# this data to claim opinions about death penality
# are independent of education levels.
x
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 474.98, df = 4, p-value < 2.2e-16
## Now code to perform a permutation test for independence
Education = newGSS$educ.yrs
DeathPenality = newGSS$cappun
observed = x$statistic # observed chi square statistic
N = 10 ^ 4 - 1
result = numeric(N)
for (i in 1:N)
{
DeathPenalityPermuted = sample(DeathPenality)
result[i] = chisq.test(table(Education, DeathPenalityPermuted))$statistic
}
hist(result)
abline(v = observed, col = "blue")

print((sum(result >= observed) + 1) / (N + 1))
## [1] 1e-04
##
## Take a smaller sample
##
index = sample(length(newGSS[, 1]), 800, replace = F)
GSS.smaller = newGSS[index, ]
tbl = with(GSS.smaller, table(cappun, educ.yrs))
x = chisq.test(tbl)
tbl
## educ.yrs
## cappun LeftHS HS SomeCollege Bachelors Grad
## favor 122 212 133 75 37
## oppose 66 57 40 29 29
x
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 21.38, df = 4, p-value = 0.0002662
########
## Now code to perform a permutation test for independence for
# small sample.
Education = GSS.smaller$educ.yrs
DeathPenality = GSS.smaller$cappun
observed = x$statistic # observed chi square statistic
N = 10 ^ 4 - 1
result = numeric(N)
for (i in 1:N)
{
DeathPenalityPermuted = sample(DeathPenality)
result[i] = chisq.test(table(Education, DeathPenalityPermuted))$statistic
}
hist(result)
abline(v = observed, col = "blue")

print((sum(result >= observed) + 1) / (N + 1))
## [1] 5e-04