Correspondence Analysis for 2 categorical variables (Race/Restaurants)

Analyzing the relationship between race and pizza for women for a market research firm

library(tidyr)
library(ca)

Read data

# import data
data <- read.csv(file = 'pizzafem truncated.csv')
head(data)
##      myid RESP_RACE pizza
## 1 2354858         1    NA
## 2 2354859         1    NA
## 3 2354861         4    NA
## 4 2354862         4     2
## 5 2354863         4     2
## 6 2354872         1    NA

Drop NA values

data.final=drop_na(data)

Contigency Table

data.final.matrix= table(data.final$RESP_RACE, data.final$pizza)
data.final.matrix <- as.matrix(data.final.matrix)
data.final.matrix
##    
##       1   2   3   4
##   1 560 508 238 476
##   2 103  76  30  67
##   3  27  33  13   7
##   4 100 165  32 139

Contigency Table with sum

contigency.table.raw= as.data.frame.matrix(table(data.final$RESP_RACE, data.final$pizza))
contigency.table.sum= as.data.frame.matrix(addmargins(table(data.final$RESP_RACE, data.final$pizza)))
RACE=c("1.00 WHITE","2.00 BLACK", "3.00 ASIAN","4.00 OTHER","SUM")
contigency.table.sum=cbind(contigency.table.sum, RACE)
contigency.table.sum=contigency.table.sum[,c(6,1,2,3,4,5)]
colnames(contigency.table.sum) <- c("RACE","1.00 pizza hut","2.00 dominos", "3.00 papa johns","4.00 little caesars","SUM")
contigency.table.sum
##           RACE 1.00 pizza hut 2.00 dominos 3.00 papa johns 4.00 little caesars
## 1   1.00 WHITE            560          508             238                 476
## 2   2.00 BLACK            103           76              30                  67
## 3   3.00 ASIAN             27           33              13                   7
## 4   4.00 OTHER            100          165              32                 139
## Sum        SUM            790          782             313                 689
##      SUM
## 1   1782
## 2    276
## 3     80
## 4    436
## Sum 2574

chi square test

chisq <- chisq.test(data.final.matrix)
chisq
## 
##  Pearson's Chi-squared test
## 
## data:  data.final.matrix
## X-squared = 53.459, df = 9, p-value = 2.392e-08

observations

chisq$observed
##    
##       1   2   3   4
##   1 560 508 238 476
##   2 103  76  30  67
##   3  27  33  13   7
##   4 100 165  32 139

expected table

round(chisq$expected,2)
##    
##          1      2      3      4
##   1 546.92 541.38 216.69 477.00
##   2  84.71  83.85  33.56  73.88
##   3  24.55  24.30   9.73  21.41
##   4 133.82 132.46  53.02 116.71

Calculate expected table manually

total.sum=marginSums(contigency.table.raw$`1`+contigency.table.raw$`2`+contigency.table.raw$`3`+contigency.table.raw$`4`)
exp.1.1=marginSums(contigency.table.raw$`1`)*marginSums(as.integer(contigency.table.raw[1,]))/total.sum
exp.1.2=marginSums(contigency.table.raw$`1`)*marginSums(as.integer(contigency.table.raw[2,]))/total.sum
exp.1.3=marginSums(contigency.table.raw$`1`)*marginSums(as.integer(contigency.table.raw[3,]))/total.sum
exp.1.4=marginSums(contigency.table.raw$`1`)*marginSums(as.integer(contigency.table.raw[4,]))/total.sum
exp.2.1=marginSums(contigency.table.raw$`2`)*marginSums(as.integer(contigency.table.raw[1,]))/total.sum
exp.2.2=marginSums(contigency.table.raw$`2`)*marginSums(as.integer(contigency.table.raw[2,]))/total.sum
exp.2.3=marginSums(contigency.table.raw$`2`)*marginSums(as.integer(contigency.table.raw[3,]))/total.sum
exp.2.4=marginSums(contigency.table.raw$`2`)*marginSums(as.integer(contigency.table.raw[4,]))/total.sum
exp.3.1=marginSums(contigency.table.raw$`3`)*marginSums(as.integer(contigency.table.raw[1,]))/total.sum
exp.3.2=marginSums(contigency.table.raw$`3`)*marginSums(as.integer(contigency.table.raw[2,]))/total.sum
exp.3.3=marginSums(contigency.table.raw$`3`)*marginSums(as.integer(contigency.table.raw[3,]))/total.sum
exp.3.4=marginSums(contigency.table.raw$`3`)*marginSums(as.integer(contigency.table.raw[4,]))/total.sum
exp.4.1=marginSums(contigency.table.raw$`4`)*marginSums(as.integer(contigency.table.raw[1,]))/total.sum
exp.4.2=marginSums(contigency.table.raw$`4`)*marginSums(as.integer(contigency.table.raw[2,]))/total.sum
exp.4.3=marginSums(contigency.table.raw$`4`)*marginSums(as.integer(contigency.table.raw[3,]))/total.sum
exp.4.4=marginSums(contigency.table.raw$`4`)*marginSums(as.integer(contigency.table.raw[4,]))/total.sum
exp.col.1=c(exp.1.1,exp.1.2,exp.1.3,exp.1.4)
exp.col.2=c(exp.2.1,exp.2.2,exp.2.3,exp.2.4)
exp.col.3=c(exp.3.1,exp.3.2,exp.3.3,exp.3.4)
exp.col.4=c(exp.4.1,exp.4.2,exp.4.3,exp.4.4)
expected.table.manual=data.frame(exp.col.1,exp.col.2,exp.col.3,exp.col.4)
expected.table.manual
##   exp.col.1 exp.col.2 exp.col.3 exp.col.4
## 1 546.92308 541.38462 216.69231 477.00000
## 2  84.70862  83.85082  33.56177  73.87879
## 3  24.55322  24.30458   9.72805  21.41414
## 4 133.81507 132.45998  53.01787 116.70707

chi square contribution table

chisq.contribution <- chisq$residuals^2
chisq.contribution=round(chisq.contribution, 3)
chisq.contribution=as.matrix(chisq.contribution)
chisq.contribution
##    
##         1     2     3     4
##   1 0.313 2.059 2.095 0.002
##   2 3.950 0.735 0.378 0.640
##   3 0.244 3.111 1.100 9.702
##   4 8.545 7.994 8.332 4.258

chi square contribution table with sum

chisq.contribution.sum= as.data.frame.matrix(addmargins(chisq.contribution))
RACE=c("1.00 WHITE","2.00 BLACK", "3.00 ASIAN","4.00 OTHER","SUM")
chisq.contribution.sum=cbind(chisq.contribution.sum, RACE)
chisq.contribution.sum=chisq.contribution.sum[,c(6,1,2,3,4,5)]
colnames(chisq.contribution.sum) <- c("RACE","1.00 pizza hut","2.00 dominos", "3.00 papa johns","4.00 little caesars","SUM")
chisq.contribution.sum
##           RACE 1.00 pizza hut 2.00 dominos 3.00 papa johns 4.00 little caesars
## 1   1.00 WHITE          0.313        2.059           2.095               0.002
## 2   2.00 BLACK          3.950        0.735           0.378               0.640
## 3   3.00 ASIAN          0.244        3.111           1.100               9.702
## 4   4.00 OTHER          8.545        7.994           8.332               4.258
## Sum        SUM         13.052       13.899          11.905              14.602
##        SUM
## 1    4.469
## 2    5.703
## 3   14.157
## 4   29.129
## Sum 53.458

Calculate 3 metrics

metrics.result <- ca(data.final.matrix)

# Get mass, inertia and quality metrics for rows
names <- metrics.result$rownames
mass <- metrics.result$rowmass
inertia <- metrics.result$rowinertia
quality <- metrics.result$rowdist

metrics.row.table <- data.frame(names,mass,inertia,quality)
metrics.row.table
##   names       mass     inertia    quality
## 1     1 0.69230769 0.001736074 0.05007657
## 2     2 0.10722611 0.002215712 0.14374950
## 3     3 0.03108003 0.005500239 0.42067824
## 4     4 0.16938617 0.011316724 0.25847669
# Get mass, inertia and quality metrics for columns
names <- metrics.result$colnames
mass <- metrics.result$colmass
inertia <- metrics.result$colinertia
quality <- metrics.result$coldist

metrics.col.table <- data.frame(names,mass,inertia,quality)
metrics.col.table
##   names      mass     inertia   quality
## 1     1 0.3069153 0.005070425 0.1285325
## 2     2 0.3038073 0.005399546 0.1333151
## 3     3 0.1216006 0.004625416 0.1950327
## 4     4 0.2676768 0.005673361 0.1455844
metrics.result
## 
##  Principal inertias (eigenvalues):
##            1        2        3       
## Value      0.014114 0.005141 0.001514
## Percentage 67.96%   24.75%   7.29%   
## 
## 
##  Rows:
##                 1         2         3         4
## Mass     0.692308  0.107226  0.031080  0.169386
## ChiDist  0.050077  0.143750  0.420678  0.258477
## Inertia  0.001736  0.002216  0.005500  0.011317
## Dim. 1  -0.343356 -0.793746 -1.389359  2.160742
## Dim. 2   0.310330  0.179879 -5.392331 -0.392816
## 
## 
##  Columns:
##                 1         2         3        4
## Mass     0.306915  0.303807  0.121601 0.267677
## ChiDist  0.128532  0.133315  0.195033 0.145584
## Inertia  0.005070  0.005400  0.004625 0.005673
## Dim. 1  -1.017265  0.817238 -1.464276 0.904032
## Dim. 2   0.131312 -1.274237 -0.152200 1.364812

Correspondence plot

plot(metrics.result)

## Correspondence plot with arrows

plot(metrics.result, mass = TRUE, contrib = "absolute",map =
   "rowgreen", arrows = c(TRUE, TRUE))

## Standardized Adjusted Residuals

std.residuals <- chisq$stdres
std.residuals <-as.matrix(std.residuals)
std.residuals
##    
##               1           2           3           4
##   1  1.21085174 -3.10006194  2.78426354 -0.09645641
##   2  2.52649621 -1.08749100 -0.69426756 -0.98975998
##   3  0.60256339  2.14752172  1.13711356 -3.69779717
##   4 -3.85269907  3.71801464 -3.37934080  2.64586432