options(repos = c(CRAN = "https://cloud.r-project.org/"))
install.packages("knitr", repos = "http://cran.us.r-project.org")

Using R to Analyze Text Data

Today, we will learn how to analyze text data collected from a platform such as Twitter, Facebook, or Bluesky. We will use the Bluesky data you collected last class.

First, import the data that you saved last class. If you missed this step, re-download it from Bluesky.

posts <- read.csv("aoc_posts.csv")

head(posts)$text
## [1] "AOC has raised over $90,000 in the past 24 hours for those impacted by the LA wildfires. \n\nThanks to everyone who has chipped in so far. If you're able to support, here's a link that goes 100% to organizations on the ground:\nsecure.actblue.com/donate/aoc-la"                                        
## [2] "Hmm can we workshop the name please? They’re going to be bullied by the skibidi kids"                                                                                                                                                                                                                        
## [3] "Recently took a peek at Twitter and the disinformation is getting substantially more dangerous. Someone could get killed.\n\nThe tactics of constantly repeating outright lies, targeting specific people, repurposing years-old footage and presenting them as new is getting to another level. It’s scary."
## [4] "AOC: “Oh, I don’t think we’re witnessing the START of an oligarchy. I think we are fully here.” 🇺🇸"                                                                                                                                                                                                          
## [5] "You know it’s bad when even Trump is pity posting through it 💀"                                                                                                                                                                                                                                             
## [6] "Tried my best. Sorry I couldn’t pull it through everyone - we live to fight another day. ♥️"

Searching for Strings

One of the most basic things we might want to do when facing a corpus of text is determine whether or not the documents contain particular phrases or references to particular entities. For this, we can use the grep and grepl functions in R. For example, if we wanted to see which posts contain the term “Democrat”, we can do the following:

grep("Democrat",
     posts$text,
     ignore.case = T)
## [1]  30 146 327

This supplies a vector of index values referring to the posts that match the search criteria. If we use grepl instead, we get the following output:

grepl("Democrat",
      posts$text,
      ignore.case = T)
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE

grep can be more useful for subsetting, whereas grepl is more immediately useful for coding variables. For example, if we wanted to add a variable to our data that records whether or not a post references “Democrats”, we could do the following:

grepl("Democrat",
      posts$text,
      ignore.case = T) -> posts$democrat
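
A quick way to see how many posts were coded TRUE is to tabulate the new variable (a quick check; output not shown here):

table(posts$democrat)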

If instead we wanted to subset down to those posts that contain the term “Democrat,” we could do the following:

grepl("Democrat",
      posts$text,
      ignore.case = T) %>% 
  posts[.,] -> posts_dem

head(posts_dem)$text
## [1] "Fun fact: there’s never been a Democratic director of the FBI in its entire history."                                                                                                                                                                               
## [2] "Honoring autocrats is not “subtlety” or “sophistication,” whatever the Biden admin may say. It sells out human rights advocates, dispirits our democratic allies, strengthens our enemies. Modi should be a pariah."                                                
## [3] "I also dislike the 2 party system fwiw. In New York, for example, I run as the nominee for two parties: Democratic and Working Families.\n\nNY has a fusion system, where all the votes for a person get added together even they’re cast on different party lines."

Data cleaning

Text data will often contain noise that can influence your analysis, such as unwanted symbols and punctuation, stop words (e.g., “the”, “an”), and URLs. Let’s take a look at how to start cleaning that up.
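
One kind of noise mentioned above that we won't handle in the walkthrough below is URLs. If you want them removed entirely, it is easiest to strip them before removing punctuation (otherwise that step just splits links into loose words). A minimal sketch, assuming links start with http/https and writing to a new column so the steps below are unchanged (text_nourl is a name introduced here for illustration):

# optional: drop http/https links; links without a scheme (e.g. "secure.actblue.com/...")
# would need a broader pattern
posts$text_nourl <- gsub("http[^[:space:]]+", " ", posts$text)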

Let’s look at a sample post.

posts$text[3]
## [1] "Recently took a peek at Twitter and the disinformation is getting substantially more dangerous. Someone could get killed.\n\nThe tactics of constantly repeating outright lies, targeting specific people, repurposing years-old footage and presenting them as new is getting to another level. It’s scary."

First, let’s get rid of punctuation and see how it changes our sample post above.

posts$text %>% 
  gsub("[[:punct:] ]+",' ',.) -> posts$text_clean

posts$text_clean[3]
## [1] "Recently took a peek at Twitter and the disinformation is getting substantially more dangerous Someone could get killed \n\nThe tactics of constantly repeating outright lies targeting specific people repurposing years old footage and presenting them as new is getting to another level It s scary "

The punctuation has been removed, but we still see line breaks (\n) that aren’t meaningful text. We can remove those too.

posts$text_clean %>% 
  gsub('\\n','',.) -> posts$text_clean

posts$text_clean[3]
## [1] "Recently took a peek at Twitter and the disinformation is getting substantially more dangerous Someone could get killed The tactics of constantly repeating outright lies targeting specific people repurposing years old footage and presenting them as new is getting to another level It s scary "

Next, we can remove stop words. There are a lot of them, but there are valuable dictionaries of stop words that we can leverage.

For example, we can use the package ‘stopwords’.

install.packages("stopwords")

We specify the language as English and see what the list looks like.

library(stopwords)
stop_words <- stopwords("en")

stop_words[1:10]
##  [1] "i"         "me"        "my"        "myself"    "we"        "our"      
##  [7] "ours"      "ourselves" "you"       "your"

Then, we exclude all stop words from our sample. For example, we can loop over the list of stop words and remove them from our corpus.

for(i in stop_words){
  # \\b marks a word boundary, so only whole words are removed (not substrings)
  posts$text_clean <- gsub(paste0("\\b",i,"\\b"), "", posts$text_clean)
}

posts$text_clean[3]
## [1] "Recently took  peek  Twitter   disinformation  getting substantially  dangerous Someone  get killed The tactics  constantly repeating outright lies targeting specific people repurposing years old footage  presenting   new  getting  another level It s scary "

A few more steps! We can remove (1) numbers, (2) remaining words with fewer than three letters, and (3) non-ASCII characters such as emojis.

posts$text_clean %>% 
  gsub('[[:digit:]]+', '', .) %>% 
  gsub('\\b\\w{1,2}\\b','',.) %>% 
  gsub('[^\x01-\x7F]', '', .) -> posts$text_clean
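
As the outputs above show, removing words leaves behind runs of extra spaces. An optional final tidy-up is to collapse repeated whitespace and trim the ends; it is not applied here, so the corpus output below matches the text as-is (text_tidy is a name introduced for illustration):

# squeeze repeated whitespace into single spaces and trim leading/trailing spaces
text_tidy <- trimws(gsub("[[:space:]]+", " ", posts$text_clean))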

Okay, now that we have a less noisy dataset, we can do some analysis on this text.

Looking into our text data

Now that we have some relatively clean text, we might want to compute summaries or visualize word frequencies. To do this, we need to form what is known as a term-document matrix, where each row is a term (word) and each column is a post. For this we will use the tm package and create our first corpus:

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'tm'
## The following object is masked from 'package:stopwords':
## 
##     stopwords
corpus <- VCorpus(VectorSource(posts$text_clean))
corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 481

To access information on the corpus elements, you can treat it like a list!

corpus[[1]]$content  
## [1] "AOC  raised      past  hours   impacted    wildfires Thanks  everyone   chipped   far    able  support    link  goes   organizations   ground secure actblue com donate aoc "

Now let’s make the term-document matrix:

dtm <- TermDocumentMatrix(corpus)                  # rows are terms, columns are posts
matrix <- as.matrix(dtm)
words <- sort(rowSums(matrix), decreasing = TRUE)  # total count of each term across all posts
df <- data.frame(word = names(words), freq = words)

head(df)
##          word freq
## people people   80
## can       can   75
## one       one   52
## like     like   47
## just     just   46
## lot       lot   41

Above, you can see a data frame of words and their frequencies in our corpus, ordered by descending count. Let’s build a word cloud to see visually which words are most common.

#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
set.seed(1234) # for reproducibility 
wordcloud(words = df$word, freq = df$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
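
If you would rather see exact counts than a cloud, a simple bar chart of the most frequent terms works too (a minimal sketch using base graphics):

barplot(head(df$freq, 15),
        names.arg = head(df$word, 15),
        las = 2,  # rotate the term labels so they are readable
        main = "Most frequent terms")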

Exercise

1a. Make sure you’ve downloaded your own posts, cleaned the data, and created a word cloud as shown above.

1b. Subset the content for posts that contain the word “Democrat”, and create a word cloud of them.

1c. Subset the content for posts that contain the word “Republican”, and create a word cloud of them. Do you notice notable differences in the content?

2a. Compare the repost counts for posts containing the word “Democrat” and posts containing the word “Republican”. Which gets more interaction (likes, reposts, comments) on average? (Note: you can use other keywords that interest you for this analysis, such as specific political topics.)

2b. Visually plot your analysis in 2a.