## Chi-Square Test for Independence

# How to conduct a chi-square test for independence. The test 
# is applied when there are two categorical variables from a 
# single population. It is used to determine whether there is 
# a significant association between the two variables.

require(foreign)
## Loading required package: foreign
setwd("~/Desktop/R/GSS")
options(digets = 8)
GSData.df = read.dta("GSS7214_R4.DTA") # From my wd downloaded as (GSS7214_R4.DTA)
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated

## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
save(GSData.df, file = "GSS2014.rda")  # Save the data frame to save the read
                                       # from now on
#load("GSS2014.rda")

GSS.sub = GSData.df[, c('sex', 'educ', 'cappun')] # Select columns of interest
str(GSS.sub)                        # inspect  the data frame
## 'data.frame':    59599 obs. of  3 variables:
##  $ sex   : Factor w/ 2 levels "male","female": 2 1 2 2 2 1 1 1 2 2 ...
##  $ educ  : int  16 10 12 17 12 14 13 16 12 12 ...
##  $ cappun: Factor w/ 5 levels "iap","favor",..: NA NA NA NA NA NA NA NA NA NA ...
## add a column which converts the education years to different factors
GSS.sub$educ.yrs[GSS.sub$educ > 16]  = 'Grad'
GSS.sub$educ.yrs[GSS.sub$educ == 16] = 'Bachelors'
GSS.sub$educ.yrs[GSS.sub$educ < 16 
                 & GSS.sub$educ > 12] = 'SomeCollege'
GSS.sub$educ.yrs[GSS.sub$educ == 12] = 'HS'
GSS.sub$educ.yrs[GSS.sub$educ < 12] = 'LeftHS'

newGSS = na.omit(GSS.sub)          # remove observations containing NA

newGSS$cappun = droplevels(newGSS$cappun) # drop unused levels
newGSS$educ.yrs = ordered(newGSS$educ.yrs, # order the factors
        levels = c("LeftHS", "HS", "SomeCollege", "Bachelors", "Grad"))

## Create a contengency two-way table showing how different education
## levels favor or oppose capital punishment
tbl = with(newGSS, table(cappun, educ.yrs))
tbl
##         educ.yrs
## cappun   LeftHS    HS SomeCollege Bachelors  Grad
##   favor    7471 11516        8495      4273  3097
##   oppose   3450  3477        3074      1862  1836
## Chi-square Test of Independence

x = chisq.test(tbl)    # A low p-value (p < .01) suggests we sludent use
#                     this data to claim opinions about death penality
#                     are independent of education levels.
x
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 474.98, df = 4, p-value < 2.2e-16
## Now code to perform a permutation test for independence
Education = newGSS$educ.yrs
DeathPenality = newGSS$cappun
observed = x$statistic        # observed chi square statistic

N = 10 ^ 4 - 1
result = numeric(N)
for (i in 1:N)
{
  DeathPenalityPermuted = sample(DeathPenality)
  result[i] = chisq.test(table(Education, DeathPenalityPermuted))$statistic
}

hist(result)
abline(v = observed, col = "blue")

print((sum(result >= observed) + 1) / (N + 1))
## [1] 1e-04
##
## Take a smaller sample
##
index = sample(length(newGSS[, 1]), 800, replace = F)
GSS.smaller = newGSS[index, ]
tbl = with(GSS.smaller, table(cappun, educ.yrs))
x = chisq.test(tbl)
tbl
##         educ.yrs
## cappun   LeftHS  HS SomeCollege Bachelors Grad
##   favor     122 212         133        75   37
##   oppose     66  57          40        29   29
x
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 21.38, df = 4, p-value = 0.0002662
########
## Now code to perform a permutation test for independence for 
#  small sample.
Education = GSS.smaller$educ.yrs
DeathPenality = GSS.smaller$cappun
observed = x$statistic        # observed chi square statistic

N = 10 ^ 4 - 1
result = numeric(N)
for (i in 1:N)
{
  DeathPenalityPermuted = sample(DeathPenality)
  result[i] = chisq.test(table(Education, DeathPenalityPermuted))$statistic
}

hist(result)
abline(v = observed, col = "blue")

print((sum(result >= observed) + 1) / (N + 1))
## [1] 5e-04