Twitter Analysis

Extracting Tweets
- Retrieve tweets from Twitter
- Tweets Description
Text Cleaning
- Build corpus
Frequent Words
- Build Term Document Matrix
- Top Frequent Terms
Wordcloud
- Build Wordcloud

Extracting Tweets

Retrieve tweets from Twitter

# Load packages
library(rtweet)
library(tidyverse)
library(wordcloud)
library(tm)

# Twitter authentication: build the API token from the app name and the
# four credential variables (these are not defined in the visible part of
# this file and must already exist in the session).
create_token(
  app = "my_twitter_research_app",
  consumer_key = consumer_key,
  consumer_secret = consumer_secret,
  access_token = access_token,
  access_secret = access_secret
)

## <Token>
## <oauth_endpoint>
##  request:   https://api.twitter.com/oauth/request_token
##  authorize: https://api.twitter.com/oauth/authenticate
##  access:    https://api.twitter.com/oauth/access_token
## <oauth_app> my_twitter_research_app
##   key:    1KdnTxM6HKLJxnC5d1ZiMmKcf
##   secret: <hidden>
## <credentials> oauth_token, oauth_token_secret
## ---

# Query Twitter for statuses matching "trump" (n = 1000 requested)
tweets <- search_tweets(
  q = "trump",
  n = 1000,
  tweet_mode = "extended"
)

## Searching for tweets...

## Finished collecting tweets!

# Keep one row per unique tweet text (all other columns are retained)
tweets <- tweets %>%
  distinct(text, .keep_all = TRUE)
# Same query as above, this time for "erdogan"
tweet1 <- search_tweets(
  q = "erdogan",
  n = 1000,
  tweet_mode = "extended"
)

## Searching for tweets...
## Finished collecting tweets!

# Keep one row per unique tweet text (all other columns are retained)
tweet1 <- tweet1 %>%
  distinct(text, .keep_all = TRUE)

Tweets Description

## plot time series of tweets
# Counts of the "trump" statuses aggregated in three-hour bins.
# The former par(mfrow = c(1, 2)) call was dropped here: par() only
# controls base graphics, while ts_plot() yields a ggplot object (it is
# combined with ggplot2 themes and labels via `+`), so the call had no
# effect on the layout and merely altered the global graphics state.
ts_plot(tweets, "3 hours") +
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold")) +
  labs(
    x = NULL, y = NULL,
    # title typo fixed: "trum" -> "trump"
    title = "Frequency of trump Twitter statuses ",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet"
  )

## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

# Counts of the "erdogan" statuses aggregated in three-hour bins.
# Labels are attached first; the minimal theme is applied before the
# bold-title override so the override is not reset.
ts_plot(tweet1, by = "3 hours") +
  labs(
    title = "Frequency of erdogan Twitter statuses ",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(plot.title = ggplot2::element_text(face = "bold"))

## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

# Inspect the last 20 rows of each result set
tail(tweets, n = 20)

tail(tweet1, n = 20)

Text Cleaning

library(tm)

Build corpus

# Build a corpus from each set of tweet texts (character vector source)
# and convert the content to lower case.
myCorpus <- tm_map(
  Corpus(VectorSource(tweets[["text"]])),
  content_transformer(tolower)
)
myCorpus1 <- tm_map(
  Corpus(VectorSource(tweet1[["text"]])),
  content_transformer(tolower)
)
# Strip URLs: delete "http" together with every following
# non-whitespace character.
removeURL <- function(x) {
  gsub("http[^[:space:]]*", "", x)
}
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
myCorpus1 <- tm_map(myCorpus1, content_transformer(removeURL))
# Stop word list: tm's English list (minus "r" and "big"), a few extra
# tokens, and the words read from the first column of 'stopwords-id.txt'.
myStopwords <- c(setdiff(stopwords('english'), c("r", "big")), "use", "kak", "kakak", "yuk", "ya")
stopwords_id <- read.table('stopwords-id.txt', header = FALSE)
myStopwords <- c(myStopwords, as.matrix(stopwords_id$V1), "hi", "yg")
# Remove stop words BEFORE stripping punctuation. The English list holds
# contractions with their apostrophe ("doesn't", "don't", "he's", ...);
# if punctuation is stripped first these become "doesnt", "dont", "hes"
# and no longer match the list, so they leak into the frequent terms.
# NOTE(review): contractions typed with a typographic apostrophe are
# presumably still not matched - confirm whether they need normalising.
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
myCorpus1 <- tm_map(myCorpus1, removeWords, myStopwords)
# Remove anything other than letters or whitespace. `+` replaces the
# former `*` quantifier: the result is the same, but the pattern no longer
# matches the empty string at every position.
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]+", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
myCorpus1 <- tm_map(myCorpus1, content_transformer(removeNumPunct))
# Collapse runs of whitespace in the "trump" corpus, then snapshot it
# (the copy is kept for stem completion later).
myCorpus <- tm_map(myCorpus, stripWhitespace)
myCorpusCopy <- myCorpus
# Same two steps for the "erdogan" corpus.
myCorpus1 <- tm_map(myCorpus1, stripWhitespace)
myCorpusCopy1 <- myCorpus1

Frequent Words

Build Term Document Matrix

# Term-document matrix for each corpus. wordLengths = c(1, Inf) sets the
# accepted term length to 1 character or more, with no upper limit.
tdm <- TermDocumentMatrix(
  myCorpus,
  control = list(wordLengths = c(1, Inf))
)
tdm1 <- TermDocumentMatrix(
  myCorpus1,
  control = list(wordLengths = c(1, Inf))
)

# Print the summary of the "trump" matrix
tdm

## <<TermDocumentMatrix (terms: 4715, documents: 654)>>
## Non-/sparse entries: 11190/3072420
## Sparsity           : 100%
## Maximal term length: 31
## Weighting          : term frequency (tf)

# Print the summary of the term-document matrix built from the "erdogan" corpus
tdm1

## <<TermDocumentMatrix (terms: 5617, documents: 631)>>
## Non-/sparse entries: 14367/3529960
## Sparsity           : 100%
## Maximal term length: 44
## Weighting          : term frequency (tf)

Top Frequent Terms

# Terms that occur at least 20 times in each term-document matrix
freq.terms <- findFreqTerms(tdm, lowfreq = 20)
freq.terms1 <- findFreqTerms(tdm1, lowfreq = 20)

# Show at most the first 50 terms. head() is used instead of [1:50]
# because indexing past the end of a shorter vector pads the result
# with NA entries.
head(freq.terms, 50)

##  [1] "president"       "trumps"          "macron"         
##  [4] "trump"           "de"              "realdonaldtrump"
##  [7] "doesnt"          "much"            "people"         
## [10] "rain"            "world"           "votes"          
## [13] "florida"         "fraud"           "election"       
## [16] "like"            "dont"            "will"           
## [19] "en"              "la"              "que"            
## [22] "get"             "going"           "hes"            
## [25] "just"            "amp"             "one"            
## [28] "democrats"       "house"           "go"             
## [31] "russia"          "now"             "even"           
## [34] "putin"           "military"        "paris"          
## [37] "us"              "day"             "im"             
## [40] "didnt"           "still"           "Ã"              
## [43] "american"        "america"         "cant"           
## [46] "donald"          "le"              "can"            
## [49] "never"           "vote"

# Show at most the first 50 terms; head() avoids the NA padding that
# [1:50] produces when fewer than 50 terms are available.
head(freq.terms1, 50)

##  [1] "akparti"                 "bu"                     
##  [3] "da"                      "de"                     
##  [5] "eyt"                     "kadar"                  
##  [7] "rterdogan"               "sayn"                   
##  [9] "Ã"                       "mhpbilgi"               
## [11] "n"                       "numankurtulmus"         
## [13] "sÃ"                      "yok"                    
## [15] "atatÃ"                   "bir"                    
## [17] "daha"                    "dÃ"                     
## [19] "en"                      "erdoan"                 
## [21] "geÃ"                     "gÃ"                     
## [23] "in"                      "iÃ"                     
## [25] "ne"                      "nÃ"                     
## [27] "ve"                      "dbdevletbahceli"        
## [29] "ikalin"                  "tcbestepe"              
## [31] "zehrazumruts"            "biz"                    
## [33] "e"                       "emeklilikteyaataklanlar"
## [35] "k"                       "olarak"                 
## [37] "rÃ"                      "yÃ"                     
## [39] "beoydeildir"             "bÃ"                     
## [41] "eytÃ"                    "mÃ"                     
## [43] "ok"                      "olan"                   
## [45] "var"                     "abdulhamitgul"          
## [47] "istiyoruz"               "sn"                     
## [49] "artk"                    "bekliyoruz"

# "trump": total count of every term across all documents, restricted to
# terms seen at least 50 times, as a term/freq data frame.
term.freq <- rowSums(as.matrix(tdm))
term.freq <- term.freq[term.freq >= 50]
df <- data.frame(term = names(term.freq), freq = term.freq)

# "erdogan": same steps with a threshold of 30.
term.freq1 <- rowSums(as.matrix(tdm1))
term.freq1 <- term.freq1[term.freq1 >= 30]
df1 <- data.frame(term = names(term.freq1), freq = term.freq1)

# Horizontal bar charts of the most frequent terms, one per data set.
# The former par(mfrow = c(1, 2)) call was removed: par() applies to base
# graphics only, so it did not arrange these ggplot2 plots side by side
# and merely altered the global graphics state.
ggplot(df, aes(x=term, y=freq)) + geom_bar(stat="identity") +
  xlab("Terms") + ylab("Count") + coord_flip() +
  theme(axis.text=element_text(size=7))

ggplot(df1, aes(x=term, y=freq)) + geom_bar(stat="identity") +
  xlab("Terms") + ylab("Count") + coord_flip() +
  theme(axis.text=element_text(size=7))

Wordcloud

Build Wordcloud

library(wordcloud)

# Dense copies of the two term-document matrices
m <- as.matrix(tdm)
m1 <- as.matrix(tdm1)
# Total count per term, sorted from most to least frequent.
# TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
word.freq <- sort(rowSums(m), decreasing = TRUE)
word.freq1 <- sort(rowSums(m1), decreasing = TRUE)
# Colors: the 9-class "BuGn" palette without its first five entries
pal <- brewer.pal(9, "BuGn")[-(1:5)]

# Draw both word clouds in a 1 x 2 layout. par() returns the previous
# settings, which are restored afterwards so later plots are not affected.
old_par <- par(mfrow = c(1, 2))
# Only terms with a count of at least 20 are drawn; FALSE is spelled out
# because `F` is an ordinary variable that can be reassigned.
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 20,
    random.order = FALSE, colors = pal)
wordcloud(words = names(word.freq1), freq = word.freq1, min.freq = 20,
    random.order = FALSE, colors = pal)
par(old_par)