# used packages
library(tidyverse) # base tidy data tools
library(tidytext) # text mining; gets along with tidyverse
library(janeaustenr) # Jane Austen's novels
library(FactoMineR) # Multivariate Statistics methodsCorrespondence Notes
# austen_books(): function where you can find the texts of all six novels in a single data frameDetecting Associations with Correspondence Analysis (CA)
Say we are interested in studying the use of punctuation symbols across all Austen’s novels:
- commas: “,”
- semicolons: “;”
- colons: “:”
- quotations: ‘\“’
- apostrophes: “’”
- question marks: “?”
- exclamation symbols: “!”
- dashes (pairs): “–”
str_count() will count the frequencies of these types of symbols, and then get the total sum for each book.
crosstable = austen_books() |>
mutate(
commas = str_count(text, ","),
colons = str_count(text, ":"),
semicolons = str_count(text, ";"),
quotes = str_count(text, '\\"'),
apostrophes = str_count(text, "'"),
questions = str_count(text, "\\?"),
exclamations = str_count(text, "\\!"),
dashes = str_count(text, "--")
) |>
group_by(book) |>
summarise(
commas = sum(commas),
colons = sum(colons),
semis = sum(semicolons),
quotes = sum(quotes),
aposts = sum(apostrophes),
quests = sum(questions),
bangs = sum(exclamations),
dashes = sum(dashes)
)
crosstable# A tibble: 6 × 9
book commas colons semis quotes aposts quests bangs dashes
<fct> <int> <int> <int> <int> <int> <int> <int> <int>
1 Sense & Sensibility 9900 66 1572 3084 914 451 561 1178
2 Pride & Prejudice 9132 132 1538 3531 741 462 499 395
3 Mansfield Park 12439 339 2260 3292 1135 471 496 413
4 Emma 12020 174 2353 4189 1226 621 1063 3100
5 Northanger Abbey 6085 83 1172 2151 545 392 433 419
6 Persuasion 7025 130 1320 1565 582 217 318 142
This is an example of a cross-table (aka 2-way table or contingency table). It contains counts (non-negative). This table is the result of crossing the categories of 2 qualitative (categorical) variables.
Correspondence Analysis was developed to analyze contingency tables in which a sample of of observations is described by two nominal variables, but it was rapidly extended to the analysis of any data table with non-negative entries.
On a side note, we should mention that CA was often discovered (and rediscovered), and so variations of CA can be found under several different names such as “dual scaling,” “optimal scaling,” “homogeneity analysis,” or “reciprocal averaging.” The multiple identities of correspondence analysis are a consequence of its large number of properties, that answer a lot of apparently different problems.
Table of Relative Frequencies
In order to explain Correspondence Analysis, and also to simplify some of the computations on the data in the crosstable, it’s better if we reformat this object as a matrix:
# cross-table in matrix format
X = as.matrix(crosstable[,-1])
rownames(X) = str_extract(crosstable$book, "\\w+")
X commas colons semis quotes aposts quests bangs dashes
Sense 9900 66 1572 3084 914 451 561 1178
Pride 9132 132 1538 3531 741 462 499 395
Mansfield 12439 339 2260 3292 1135 471 496 413
Emma 12020 174 2353 4189 1226 621 1063 3100
Northanger 6085 83 1172 2151 545 392 433 419
Persuasion 7025 130 1320 1565 582 217 318 142
Relative Frequencies or Probabilities
The first step involves converting the frequencies or counts in X into relative frequencies (i.e. proportions) by dividing the cells in X over the total count of punctuation symbols:
Xprobs = X / sum(X)
round(Xprobs, 4) commas colons semis quotes aposts quests bangs dashes
Sense 0.0967 0.0006 0.0154 0.0301 0.0089 0.0044 0.0055 0.0115
Pride 0.0892 0.0013 0.0150 0.0345 0.0072 0.0045 0.0049 0.0039
Mansfield 0.1216 0.0033 0.0221 0.0322 0.0111 0.0046 0.0048 0.0040
Emma 0.1175 0.0017 0.0230 0.0409 0.0120 0.0061 0.0104 0.0303
Northanger 0.0595 0.0008 0.0115 0.0210 0.0053 0.0038 0.0042 0.0041
Persuasion 0.0687 0.0013 0.0129 0.0153 0.0057 0.0021 0.0031 0.0014
Think of the proportions in Xprobs as joint probabilities.
Marginal Probabilities
Column Margin is given by the sum of all entries in each column, column by column. These marginal probabilities can be easily obtain with colSums()
# column margin
col_margin = colSums(Xprobs)
round(col_margin, 4)commas colons semis quotes aposts quests bangs dashes
0.5531 0.0090 0.0998 0.1741 0.0503 0.0255 0.0329 0.0552
Row Margin is given by the sum of all the entires in each row, row by row. These marginal probabilities can be easily obtain with rowSums()
# row margin
row_margin = rowSums(Xprobs)
round(row_margin, 4) Sense Pride Mansfield Emma Northanger Persuasion
0.1732 0.1606 0.2037 0.2418 0.1102 0.1104
Independence Model
From basic probability rules, we know that if two events A and B are independent, their joint probability P(A and B) is given by the product of the marginal probabilities, that is:
\(P(A and B) = P(A) * P(B)\)
Relative Frequencies under Independence
What would the joint probabilities look like if there was no association between the books and the punctuation symbols?
# under independence
Xindep = row_margin %o% col_margin
round(Xindep, 4) commas colons semis quotes aposts quests bangs dashes
Sense 0.0958 0.0016 0.0173 0.0302 0.0087 0.0044 0.0057 0.0096
Pride 0.0888 0.0014 0.0160 0.0279 0.0081 0.0041 0.0053 0.0089
Mansfield 0.1127 0.0018 0.0203 0.0355 0.0102 0.0052 0.0067 0.0112
Emma 0.1338 0.0022 0.0241 0.0421 0.0122 0.0062 0.0080 0.0133
Northanger 0.0610 0.0010 0.0110 0.0192 0.0055 0.0028 0.0036 0.0061
Persuasion 0.0611 0.0010 0.0110 0.0192 0.0055 0.0028 0.0036 0.0061
To visualize the joint probability distributions under the assumption of independence, make a mosaicplot like the following one.
mosaicplot(t(Xindep), las = 1, border = NA, main = "Independence Model")The more a given joint probability departures from the expected probability under independence, the stronger the association will be between the “events.” This is one of the core ideas behind Correspondence Analysis.
Row Analysis
# row profiles (i.e. conditional probabilities on books)
row_profiles = sweep(Xprobs, MARGIN = 1, STATS = row_margin, FUN = "/")
round(row_profiles, 4) commas colons semis quotes aposts quests bangs dashes
Sense 0.5585 0.0037 0.0887 0.1740 0.0516 0.0254 0.0316 0.0665
Pride 0.5558 0.0080 0.0936 0.2149 0.0451 0.0281 0.0304 0.0240
Mansfield 0.5967 0.0163 0.1084 0.1579 0.0544 0.0226 0.0238 0.0198
Emma 0.4857 0.0070 0.0951 0.1693 0.0495 0.0251 0.0430 0.1253
Northanger 0.5395 0.0074 0.1039 0.1907 0.0483 0.0348 0.0384 0.0371
Persuasion 0.6217 0.0115 0.1168 0.1385 0.0515 0.0192 0.0281 0.0126
The entries in this table are basically conditional probabilities. A property of this table is that its row-sums are equal to 1:
rowSums(row_profiles) Sense Pride Mansfield Emma Northanger Persuasion
1 1 1 1 1 1
Average Book profile: We can take into account the average book profile given by the marginal probabilities in col_margin
Rows = rbind(row_profiles, average = col_margin)
round(Rows, 4) commas colons semis quotes aposts quests bangs dashes
Sense 0.5585 0.0037 0.0887 0.1740 0.0516 0.0254 0.0316 0.0665
Pride 0.5558 0.0080 0.0936 0.2149 0.0451 0.0281 0.0304 0.0240
Mansfield 0.5967 0.0163 0.1084 0.1579 0.0544 0.0226 0.0238 0.0198
Emma 0.4857 0.0070 0.0951 0.1693 0.0495 0.0251 0.0430 0.1253
Northanger 0.5395 0.0074 0.1039 0.1907 0.0483 0.0348 0.0384 0.0371
Persuasion 0.6217 0.0115 0.1168 0.1385 0.0515 0.0192 0.0281 0.0126
average 0.5531 0.0090 0.0998 0.1741 0.0503 0.0255 0.0329 0.0552
One way to visualize departure from independence is with a mosaicplot of the above table of conditional probabilities or row profiles (see below). Notice that we are also including the marginal probabilities of col_margin which is the average book profile.
mosaicplot(
t(Rows),
las = 1,
border = NA,
col = rainbow(ncol(Rows)),
main = "Row Profiles")From this picture, we can immediately tell that:
Commas “,” are the most used punctuation symbol; also the use of commas seems to be evenly distributed across all books.
In contrast, colons “:” are the least used symbol; notice that Mansfield has the largest proportion of colons.
Semicolons “;” also seem to be evenly distributed across all books, although not as much as commas.
Dashes “–” exhibit the largest amount of variability across different books; Persuasion being the book with the least amount of dashes, whereas Emma is the novel with most dashes (compared to other books).
Column Analysis
# column profiles (i.e. conditional probabilities on symbols)
col_profiles = sweep(Xprobs, MARGIN = 2, STATS = col_margin, FUN = "/")
round(col_profiles, 4) commas colons semis quotes aposts quests bangs dashes
Sense 0.1749 0.0714 0.1539 0.1731 0.1777 0.1725 0.1665 0.2086
Pride 0.1613 0.1429 0.1506 0.1982 0.1441 0.1767 0.1481 0.0699
Mansfield 0.2198 0.3669 0.2212 0.1848 0.2207 0.1802 0.1472 0.0731
Emma 0.2124 0.1883 0.2303 0.2352 0.2384 0.2376 0.3154 0.5490
Northanger 0.1075 0.0898 0.1147 0.1208 0.1060 0.1500 0.1285 0.0742
Persuasion 0.1241 0.1407 0.1292 0.0879 0.1132 0.0830 0.0944 0.0251
Average Symbol profile: We can take into account the average symbol profile given by the marginal probabilities in row_margin
# adding average column profile
Cols = cbind(col_profiles, average = row_margin)
round(Cols, 4) commas colons semis quotes aposts quests bangs dashes average
Sense 0.1749 0.0714 0.1539 0.1731 0.1777 0.1725 0.1665 0.2086 0.1732
Pride 0.1613 0.1429 0.1506 0.1982 0.1441 0.1767 0.1481 0.0699 0.1606
Mansfield 0.2198 0.3669 0.2212 0.1848 0.2207 0.1802 0.1472 0.0731 0.2037
Emma 0.2124 0.1883 0.2303 0.2352 0.2384 0.2376 0.3154 0.5490 0.2418
Northanger 0.1075 0.0898 0.1147 0.1208 0.1060 0.1500 0.1285 0.0742 0.1102
Persuasion 0.1241 0.1407 0.1292 0.0879 0.1132 0.0830 0.0944 0.0251 0.1104
Similarly, we can get a mosaicplot to visualize the distribution of these conditional probabilities. Notice that we are also including the marginal probabilities of row_margin which is the average symbol profile.
mosaicplot(
t(Cols),
las = 1,
border = NA,
col = rainbow(nrow(Cols)),
main = "Column Profiles")From this mosaicplot we can see that:
Commas and semicolons have similar distributions.
Similarly, quotes and question marks also have similar distributions.
Commas have the closest distribution to the average profile.
Compared to average profile, colons and dashes are the symbols that deviate the most from the average profile.
As we have previously detected, Emma is the book that has the largest proportion of dashes; in contrast Persuasion has the least proportion of dashes.
Also, Mansfield has the largest proportion of colons.
Simultaneous Representation of Rows and Colums
From an exploratory data analysis standpoint, we can use Correspondence Analysis to obtain a “map” (or scatterplot) to visually represent the categories behind the crosstable of frequencies, like in the following graphic:
austen_ca1 = CA(X)By default, CA() produces its own plot (see figure above). You can turn off this behavior with the argument graph = FALSE
# no default plot
austen_ca1 = CA(X, graph = FALSE)In order to use ggplot(), a bit of data manipulation is necessary:
# table with row and column coordinates (i.e. factor scores)
ca_dat = data.frame(
rbind(austen_ca1$row$coord[ ,1:2],
austen_ca1$col$coord[ ,1:2]))
# type of book or symbol
ca_dat$type = c(rep("book", nrow(austen_ca1$row$coord)),
rep("symbol", nrow(austen_ca1$col$coord)))
# correspondence analysis scatterplot
ggplot(ca_dat, aes(x = Dim.1, y = Dim.2, color = type)) +
geom_hline(yintercept = 0, col = "gray60") +
geom_vline(xintercept = 0, col = "gray60") +
geom_point() +
geom_text(label = rownames(ca_dat), alpha = 0.8) +
scale_x_continuous(limits = c(-0.4, 0.8)) +
labs(title = "Correspondence Analysis map",
x = sprintf("Dim-1 (%0.2f%s)", austen_ca1$eig[1,2], "%"),
y = sprintf("Dim-2 (%0.2f%s)", austen_ca1$eig[2,2], "%"))What does CA do?: Without going down the technical rabbit hole behind CA, it can be said that CA transforms a data table into two sets of new variables called factor scores: one set for the rows, and one set for the columns. These factor scores give the best representation of the similarity structure of, respectively, the rows and the columns of the table.
In the above map, rows and columns are represented as points whose coordinates are the factor scores and where the dimensions are also called factors, or simply dimensions. Interestingly, the factor scores of the rows and the columns have the same variance and, therefore, the rows and columns can be conveniently represented in one single map.
How to Interpret Point Proximity: Because of the way in which the coordinates are obtained to produce the above CA map, we get a nice interpretation of the displayed data. When two row points (or two column points, respectively) are close to each other, this means that these points have similar profiles, and therefore they will be located exactly at the same place.
What about the proximity between row and column points? It turns out that we can comment on the position of a row with respect to the positions of all of the columns but keeping in mind that it is impossible to draw conclusions about the distance between a specific row and a specific column.
The first dimension opposes the categories dashes and colons. This opposition on the graph is associated to the book Emma which has the largest proportion of dashes compared to the rest of the books. The table below shows the relative frequencies between a given book and a given symbol. Notice the cell of Emma and dashes, which has the largest proportion. In contrast, colons are more present in Mansfield than in any other book.
The book category Sense is extremely close to the origin of the graphic, thus indicating a profile near to the average book profile. Likewise the symbol category commas is the closest to the origin, signaling that this symbol is also close to the use of the average symbol profile.
Another Example of CA
# We’ll keep using all the books by Jane Austen. As usual, we begin by tokenizing the texts, then we remove stop-words, and after that we merge—via inner_join()—sentiments from Bing’s lexicon:
word_sentims = austen_books() |>
unnest_tokens(output = word, input = text) |>
anti_join(stop_words, by = "word") |>
inner_join(sentiments,
by = "word",
relationship = "many-to-many") |>
count(book, word, name = "count", sort = TRUE)
head(word_sentims, 10)# A tibble: 10 × 3
book word count
<fct> <chr> <int>
1 Emma miss 599
2 Mansfield Park miss 432
3 Pride & Prejudice miss 283
4 Sense & Sensibility miss 210
5 Northanger Abbey miss 206
6 Emma poor 136
7 Emma happy 125
8 Persuasion miss 125
9 Mansfield Park love 124
10 Mansfield Park happy 117
Notice that the most frequent word is miss which we know is more likely to refer to a lady or woman instead of the verb “to miss”. Therefore, we are going to remove it from word_sentims
word_sentims = word_sentims |>
filter(word != "miss")
head(word_sentims, 10)# A tibble: 10 × 3
book word count
<fct> <chr> <int>
1 Emma poor 136
2 Emma happy 125
3 Mansfield Park love 124
4 Mansfield Park happy 117
5 Emma love 117
6 Emma pleasure 115
7 Mansfield Park pleasure 101
8 Sense & Sensibility happy 100
9 Emma doubt 98
10 Mansfield Park poor 96
# The next step involves identifying, in a somewhat arbitrary way, words that have a “large” count, for example a count greater than or equal to 68:
selected_words = word_sentims |>
filter(count >= 68) |>
distinct(word) |>
pull()
selected_words[1] "poor" "happy" "love" "pleasure" "doubt" "happiness"
[7] "comfort" "affection" "pretty"
# With these selected_words, we filter them in from word_sentims to get a subset word_sentims2:
word_sentims2 = word_sentims |>
filter(word %in% selected_words)
head(word_sentims2, 10)# A tibble: 10 × 3
book word count
<fct> <chr> <int>
1 Emma poor 136
2 Emma happy 125
3 Mansfield Park love 124
4 Mansfield Park happy 117
5 Emma love 117
6 Emma pleasure 115
7 Mansfield Park pleasure 101
8 Sense & Sensibility happy 100
9 Emma doubt 98
10 Mansfield Park poor 96
# Having obtained the table word_sentims2, we then proceed to obtain the cross-table between books and the selected words
crosstable2 = word_sentims2 |>
select(book, word, count) |>
pivot_wider(
names_from = word,
values_from = count)
crosstable2# A tibble: 6 × 10
book poor happy love pleasure doubt happiness comfort affection pretty
<fct> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 Emma 136 125 117 115 98 76 65 50 68
2 Mansfield… 96 117 124 101 46 86 83 52 56
3 Sense & S… 71 100 77 67 46 66 63 79 36
4 Pride & P… 38 83 92 92 37 72 31 58 24
5 Persuasion 55 64 42 39 26 32 21 9 28
6 Northange… 28 45 43 48 28 37 29 24 30
# To apply correspondence analysis, we convert crosstable into a matrix Y. In case we had entries with missing values, we play safe and artificially replace NA with 0.
Y = as.matrix(crosstable2[,-1])
Y[is.na(Y)] = 0
rownames(Y) = str_extract(crosstable2$book, "\\w+")
Y poor happy love pleasure doubt happiness comfort affection pretty
Emma 136 125 117 115 98 76 65 50 68
Mansfield 96 117 124 101 46 86 83 52 56
Sense 71 100 77 67 46 66 63 79 36
Pride 38 83 92 92 37 72 31 58 24
Persuasion 55 64 42 39 26 32 21 9 28
Northanger 28 45 43 48 28 37 29 24 30
# The last step involves passing Y to CA() to obtain the necessary outputs and produce the correspondence analysis map with ggplot() and friends:
# Correspondence Analysis
austen_ca2 = CA(Y, graph = FALSE)
# table with row and column coordinates (i.e. factor scores)
ca_dat = data.frame(
rbind(austen_ca2$row$coord[ ,1:2],
austen_ca2$col$coord[ ,1:2]))
# type of book or word
ca_dat$type = c(rep("book", nrow(austen_ca2$row$coord)),
rep("word", nrow(austen_ca2$col$coord)))
# correspondence analysis scatterplot
ggplot(ca_dat, aes(x = Dim.1, y = Dim.2, color = type)) +
geom_hline(yintercept = 0, col = "gray60") +
geom_vline(xintercept = 0, col = "gray60") +
geom_point() +
geom_text(label = rownames(ca_dat), alpha = 0.8) +
scale_x_continuous(limits = c(-0.3, 0.4)) +
labs(title = "Correspondence Analysis map",
x = sprintf("Dim-1 (%0.2f%s)", austen_ca2$eig[1,2], "%"),
y = sprintf("Dim-2 (%0.2f%s)", austen_ca2$eig[2,2], "%"))Words With Positive Sentiments
positive_words = austen_books() |>
unnest_tokens(output = word, input = text) |>
anti_join(stop_words, by = "word") |>
count(book, word, name = "count") |>
inner_join(sentiments,
by = "word",
relationship = "many-to-many") |>
filter(sentiment == "positive") |>
arrange(desc(count))
head(positive_words, 10)# A tibble: 10 × 4
book word count sentiment
<fct> <chr> <int> <chr>
1 Emma happy 125 positive
2 Mansfield Park love 124 positive
3 Mansfield Park happy 117 positive
4 Emma love 117 positive
5 Emma pleasure 115 positive
6 Mansfield Park pleasure 101 positive
7 Sense & Sensibility happy 100 positive
8 Pride & Prejudice love 92 positive
9 Pride & Prejudice pleasure 92 positive
10 Mansfield Park happiness 86 positive
# We can identify words that have a count greater than 50 (you can choose another threshold):
selected_positive = positive_words |>
filter(count >= 50) |>
distinct(word) |>
pull()
selected_positive [1] "happy" "love" "pleasure" "happiness" "comfort" "affection"
[7] "pretty" "glad" "perfectly" "ready" "kindness" "assure"
[13] "superior" "fine" "agreeable" "satisfied" "regard"
# With these subset of positive words, we filter them in from positive_words and then obtain the cross-table between book categories and positive words:
crosstable3 = positive_words |>
filter(word %in% selected_positive) |>
select(book, word, count) |>
pivot_wider(
names_from = word,
values_from = count)
crosstable3# A tibble: 6 × 18
book happy love pleasure happiness comfort affection pretty glad perfectly
<fct> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 Emma 125 117 115 76 65 50 68 50 67
2 Mansf… 117 124 101 86 83 52 56 67 48
3 Sense… 100 77 67 66 63 79 36 44 43
4 Pride… 83 92 92 72 31 58 24 37 47
5 Persu… 64 42 39 32 21 9 28 33 43
6 North… 45 43 48 37 29 24 30 32 23
# ℹ 8 more variables: ready <int>, kindness <int>, assure <int>,
# superior <int>, fine <int>, agreeable <int>, satisfied <int>, regard <int>
# To pass the cross-table to CA(), we convert crosstable3 into a matrix Xpos:
Xpos = as.matrix(crosstable3[,-1])
Xpos[is.na(Xpos)] = 0
rownames(Xpos) = str_extract(crosstable3$book, "\\w+")
Xpos happy love pleasure happiness comfort affection pretty glad
Emma 125 117 115 76 65 50 68 50
Mansfield 117 124 101 86 83 52 56 67
Sense 100 77 67 66 63 79 36 44
Pride 83 92 92 72 31 58 24 37
Persuasion 64 42 39 32 21 9 28 33
Northanger 45 43 48 37 29 24 30 32
perfectly ready kindness assure superior fine agreeable satisfied
Emma 67 66 40 59 59 42 50 52
Mansfield 48 60 60 19 11 57 52 44
Sense 43 35 42 32 12 25 25 26
Pride 47 20 29 39 18 31 45 34
Persuasion 43 22 15 25 9 33 38 23
Northanger 23 23 14 18 5 28 32 17
regard
Emma 43
Mansfield 37
Sense 50
Pride 49
Persuasion 25
Northanger 10
# And finally we apply CA() to Xpos to get the CA map:
ca_pos = CA(Xpos, graph = FALSE)
# table with row and column coordinates (i.e. factor scores)
ca_dat = data.frame(
rbind(ca_pos$row$coord[ ,1:2],
ca_pos$col$coord[ ,1:2]))
# type of book or word
ca_dat$type = c(rep("book", nrow(ca_pos$row$coord)),
rep("word", nrow(ca_pos$col$coord)))
# correspondence analysis scatterplot
ggplot(ca_dat, aes(x = Dim.1, y = Dim.2, color = type)) +
geom_hline(yintercept = 0, col = "gray60") +
geom_vline(xintercept = 0, col = "gray60") +
geom_point() +
geom_text(label = rownames(ca_dat), alpha = 0.8) +
scale_x_continuous(limits = c(-0.4, 0.5)) +
labs(title = "Correspondence Analysis map",
subtitle = "Words with positive sentiments",
x = sprintf("Dim-1 (%0.2f%s)", ca_pos$eig[1,2], "%"),
y = sprintf("Dim-2 (%0.2f%s)", ca_pos$eig[2,2], "%"))