This short article aims to analyze and compare the New Year speeches made by Xi Jinping and Tsai Ing-wen.

First of all, we install the required libraries (if they are missing) and load them into the R session.

# Install any missing package first, then attach both packages.
# `requireNamespace()` only checks availability; `library()` does the
# attaching because, unlike `require()`, it stops with an error when a
# package cannot be loaded, and it also attaches a freshly installed package.
# NOTE(review): from quanteda 3.0 on, textplot_wordcloud() is provided by the
# separate quanteda.textplots package -- confirm the installed version.
if (!requireNamespace("quanteda", quietly = TRUE)) install.packages("quanteda")
if (!requireNamespace("plotly", quietly = TRUE)) install.packages("plotly")
library(quanteda)
library(plotly)

Then, let’s obtain a copy of Xi Jinping’s speech. The file has already been uploaded to my GitHub in plain text format. The first five lines of his speech are displayed for checking.

# Establish a connection to the plain-text copy of Xi's speech on GitHub.
con <- url("https://raw.githubusercontent.com/fukingwa/JMSC6116_public/master/xi2021.txt")
# Read line by line into the character vector `xi`. `finally` closes the
# connection whether or not the read succeeds, so a failed download does not
# leave an open connection behind.
xi <- tryCatch(readLines(con), finally = close(con))
# List the first five lines of Xi's speech; head() returns fewer lines
# instead of NA padding if the file is shorter than five lines.
head(xi, 5)
## [1] "The year 2021 is arriving. From China's capital Beijing, I extend my New Year wishes to you all! "                                                                                                                                                                                                                     
## [2] "2020 was an extraordinary year. Facing the sudden coronavirus pandemic, we put people and their lives first to interpret the great love among humans. With solidarity and resilience, we wrote the epic of our fight against the pandemic. "                                                                           
## [3] "During the days when we addressed the hardships together, we saw the heroic spirit of marching straight to the frontlines, holding posts with tenacity, taking responsibility to get through thick and thin, sacrifices with bravery, and touching moments of helping each other. "                                    
## [4] "From medical workers to the people's army, from scientific researchers to community workers, from volunteers to those who built the projects, from seniors to youths born after the 1990s and 2000s, numerous people fulfilled their missions at the cost of their lives and protected humanity with sincere love. "   
## [5] "They pooled their drops of strength into tremendous power and built an iron wall to safeguard lives. Many figures marched ahead without hesitation, many relays were accomplished hand in hand, many scenes showed touching moments, all these vividly illustrate the great spirit of fighting against the pandemic.  "
# Join all lines into one string so the speech is treated as one document.
xi <- paste(xi, collapse = " ")

Next, we get Tsai’s speech.

# Establish a connection to the plain-text copy of Tsai's speech on GitHub.
con <- url("https://raw.githubusercontent.com/fukingwa/JMSC6116_public/master/tsai2021.txt")
# Read line by line; `finally` closes the connection whether or not the read
# succeeds, so a failed download does not leave an open connection behind.
tsai <- tryCatch(readLines(con), finally = close(con))
# Join all lines into one string so the speech is treated as one document.
tsai <- paste(tsai, collapse = " ")

Then, we “clean” the texts by removing punctuation and English “stopwords” (“a”, “an”, “the”, etc.), and create an R object called dfm_xt, which stands for “Document Feature Matrix for Xi-Tsai”, i.e. the two leaders (by rows) and the terms used (by columns). See what it looks like. Each number in the matrix is the frequency of that term (column) in that document (row).

# Put both speeches into one corpus, one document per leader.
corp_xt <- corpus(c(xi, tsai), docnames = c("xi", "tsai"))

# Tokenise without punctuation, then drop the English stopwords taken from
# the "marimo" list.
toks_xt <- tokens_remove(
  tokens(corp_xt, remove_punct = TRUE),
  pattern = stopwords("en", source = "marimo")
)

# Count the remaining tokens per document and show the resulting matrix.
dfm_xt <- dfm(toks_xt)
print(dfm_xt)
## Document-feature matrix of: 2 documents, 886 features (44.6% sparse).
##       features
## docs   2021 arriving china's capital beijing extend new wishes 2020
##   xi      2        1       2       1       1      1   6      1    3
##   tsai    2        0       0       0       1      0  11      0    6
##       features
## docs   extraordinary
##   xi               1
##   tsai             0
## [ reached max_nfeat ... 876 more features ]

Ok. So far so good. We now compare the highest frequency terms used in each speech.

# Number of top terms to show per speech.
Num_of_terms_shown <- 5

# Return the `n` most frequent entries of the named frequency vector `freq`
# as a data frame with columns `name` (a factor whose levels follow the
# ranked order) and `y` (the counts). head() is used rather than `1:n` so a
# document with fewer than `n` terms does not produce NA rows.
top_terms <- function(freq, n) {
  ranked <- head(freq[order(freq, decreasing = TRUE)], n)
  data.frame(
    name = factor(names(ranked), levels = names(ranked)),
    y = as.vector(ranked)
  )
}

# Draw one speaker's top terms as a single-colour bar chart with the counts
# printed on the bars and the plot/axis titles left blank.
term_barplot <- function(terms, trace_name, colour) {
  fig <- plot_ly(terms, x = ~name, y = ~y, type = "bar",
                 text = ~y, textposition = "auto", name = trace_name,
                 marker = list(color = colour,
                               line = list(color = colour, width = 1.5)))
  layout(fig, title = "", xaxis = list(title = ""), yaxis = list(title = ""))
}

# Term frequencies and top-term tables for each speech.
xi_freqterm <- featfreq(dfm_xt["xi", ])
xi_barplot <- top_terms(xi_freqterm, Num_of_terms_shown)

tsai_freqterm <- featfreq(dfm_xt["tsai", ])
tsai_barplot <- top_terms(tsai_freqterm, Num_of_terms_shown)

p1 <- term_barplot(xi_barplot, "Xi Jinping's Speech", "red")
p2 <- term_barplot(tsai_barplot, "Tsai Ing-wen's Speech", "green")

# Place the two charts side by side on a shared y axis. The title is set on
# the combined figure and is derived from Num_of_terms_shown so that it
# stays correct when that number is changed.
p <- subplot(p1, p2, shareY = TRUE)
p <- layout(p,
            title = paste0("Top ", Num_of_terms_shown,
                           " Terms Used in Xi Jinping/Tsai Ing-wen's Speech"),
            showlegend = TRUE)
p

Question: is term frequency a good way for comparison? If not, what is a better option? How can we make a better plot? Hints:

# Total token count of each speech after cleaning.
print(paste0(
  "Total number of terms of Xi's speech (cleaned version):",
  sum(dfm_xt["xi", ])
))
## [1] "Total number of terms of Xi's speech (cleaned version):615"
print(paste0(
  "Total number of terms of Tsai's speech (cleaned version):",
  sum(dfm_xt["tsai", ])
))
## [1] "Total number of terms of Tsai's speech (cleaned version):776"

Next, we create a wordcloud for each.

# One wordcloud per speech (Xi first, then Tsai), keeping only terms that
# occur at least twice.
for (speaker in c("xi", "tsai")) {
  textplot_wordcloud(
    dfm_xt[speaker, ],
    random_order = FALSE,
    rotation = .25,
    min_count = 2,
    color = RColorBrewer::brewer.pal(8, "Dark2")
  )
}

Finally, we generate a comparison wordcloud, which compares the relative frequency with which a term was used in the two speeches. The terms in the upper half are those used more frequently in Xi’s speech (in proportion to their font size); those in the lower half are used more frequently in Tsai’s speech.

# Comparison cloud over both documents, keeping terms that occur at least
# twice.
textplot_wordcloud(
  dfm_xt,
  comparison = TRUE,
  min_count = 2
)

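We redo the same exercise, but this time, instead of treating each term as a token, we combine two consecutive terms into a single “token”, which is called a “bigram.” Note that the bigrams are built from the cleaned tokens, so the two words in a bigram were not necessarily adjacent in the original text (e.g. “2021_arriving” comes from “The year 2021 is arriving”).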

# Replace the cleaned tokens with bigrams (n = 2 is also the function's
# default) and rebuild the document-feature matrix. Both `toks_xt` and
# `dfm_xt` are overwritten, so everything below works on bigrams.
toks_xt <- tokens_ngrams(toks_xt, n = 2L)
dfm_xt <- dfm(toks_xt)
print(dfm_xt)
## Document-feature matrix of: 2 documents, 1,354 features (49.9% sparse).
##       features
## docs   2021_arriving arriving_china's china's_capital capital_beijing
##   xi               1                1               1               1
##   tsai             0                0               0               0
##       features
## docs   beijing_extend extend_new new_wishes wishes_2020 2020_extraordinary
##   xi                1          1          1           1                  1
##   tsai              0          0          0           0                  0
##       features
## docs   extraordinary_facing
##   xi                      1
##   tsai                    0
## [ reached max_nfeat ... 1,344 more features ]

Ok. Let’s do everything again with the bigrams and see what we get.

Barplot (Bigram)

# Number of top terms to show per speech.
Num_of_terms_shown <- 5

# The two helpers are defined in this chunk so that it runs on its own.

# Return the `n` most frequent entries of the named frequency vector `freq`
# as a data frame with columns `name` (a factor whose levels follow the
# ranked order) and `y` (the counts). head() is used rather than `1:n` so a
# document with fewer than `n` terms does not produce NA rows.
top_terms <- function(freq, n) {
  ranked <- head(freq[order(freq, decreasing = TRUE)], n)
  data.frame(
    name = factor(names(ranked), levels = names(ranked)),
    y = as.vector(ranked)
  )
}

# Draw one speaker's top terms as a single-colour bar chart with the counts
# printed on the bars and the plot/axis titles left blank.
term_barplot <- function(terms, trace_name, colour) {
  fig <- plot_ly(terms, x = ~name, y = ~y, type = "bar",
                 text = ~y, textposition = "auto", name = trace_name,
                 marker = list(color = colour,
                               line = list(color = colour, width = 1.5)))
  layout(fig, title = "", xaxis = list(title = ""), yaxis = list(title = ""))
}

# Frequencies and top-term tables for each speech, taken from the current
# `dfm_xt`.
xi_freqterm <- featfreq(dfm_xt["xi", ])
xi_barplot <- top_terms(xi_freqterm, Num_of_terms_shown)

tsai_freqterm <- featfreq(dfm_xt["tsai", ])
tsai_barplot <- top_terms(tsai_freqterm, Num_of_terms_shown)

p1 <- term_barplot(xi_barplot, "Xi Jinping's Speech", "red")
p2 <- term_barplot(tsai_barplot, "Tsai Ing-wen's Speech", "green")

# Place the two charts side by side on a shared y axis. The title is set on
# the combined figure and is derived from Num_of_terms_shown so that it
# stays correct when that number is changed.
p <- subplot(p1, p2, shareY = TRUE)
p <- layout(p,
            title = paste0("Top ", Num_of_terms_shown,
                           " Terms Used in Xi Jinping/Tsai Ing-wen's Speech"),
            showlegend = TRUE)
p

Wordcloud (Bigram)

# One wordcloud per speech (Xi first, then Tsai); min_count = 1 keeps every
# feature that occurs at least once.
for (speaker in c("xi", "tsai")) {
  textplot_wordcloud(
    dfm_xt[speaker, ],
    random_order = FALSE,
    rotation = .25,
    min_count = 1,
    color = RColorBrewer::brewer.pal(8, "Dark2")
  )
}

Comparison wordcloud (Bigram)

# Comparison cloud over both documents; min_count = 1 keeps every feature
# that occurs at least once.
textplot_wordcloud(
  dfm_xt,
  comparison = TRUE,
  min_count = 1
)