Code walkthrough for the guest lecture offered at the Isenberg School of Management on 12/04/2018.

The following libraries are required.

library(data.table) #for data manipulation and cleaning 
library(dplyr) #for data manipulation and cleaning 
library(dbplyr) #dplyr backend for working with databases
library(RSQLite) #for importing data from a SQLite database
library(quanteda) #for quantitative text analysis
library(ggplot2) #for plotting data
library(igraph) #for network analysis

The Stranger Things 2 tweets are stored in the cloud; here I use the following code to import the data from a SQLite database.

db <- src_sqlite("/home/weiaiwaynexu/Dropbox/Acer Laptop Sync/Data Science/Stranger Things MIning/Stranger_Things_Mining.sqlite", create = FALSE)

df <- tbl(db, sql("SELECT * FROM tweets")) #the table in the SQLite database is named "tweets"

df <- collect(df) #load the data into memory
df <- df[!duplicated(df$content),] #remove duplicates
unique(df$query)
## [1] "strangerthings"  "stranger_things" "strangerthings2"

Separate the tweet data (the dataframe named df) into two parts: one containing retweets (named rt), and another containing original tweets, mentions, and replies (named nonrt).

rt <- df[df$retweeted_status=="THIS IS A RETWEET --> DOUBLE-CHECK JSON",]
nonrt <- df[df$retweeted_status!="THIS IS A RETWEET --> DOUBLE-CHECK JSON",]

#remove non-English tweets
nonrt <- nonrt[nonrt$language=="en",]

print(c("# of retweets:",nrow(rt)))
## [1] "# of retweets:" "21475"
print(c("# of non-retweets:",nrow(nonrt)))
## [1] "# of non-retweets:" "68072"
remove(df) #remove the original df dataframe to free up memory.
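
As an optional sanity check, we can count the distinct users behind the non-retweets and confirm that the language filter worked.

length(unique(nonrt$from_user_id)) #number of distinct users among the non-retweets
table(nonrt$language) #should contain only "en" after the filter above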

Create a corpus and a document-term matrix (also called a dfm, short for document-feature matrix).

#create corpus; note that tweets are stored in the column named "content" 
group_corpus <- corpus(nonrt,docid_field = "tweet_id",text_field = "content")

#tokenization: remove punctuation, numbers, URLs, stop words, user-defined filter words, and words shorter than three characters. 

toks <- tokens(group_corpus,remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE)
toks <- tokens_select(toks, stopwords('en'), selection = 'remove') 
toks <- tokens_select(toks, c("rt","stranger","#strangerthings","#strangerthings2","#stranger_things","things","mt","@*"), selection = 'remove') 
toks <- tokens_select(toks, min_nchar=3, selection = 'keep')

#create document-feature matrix
toks_dfm <- dfm(toks)
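
To check that the filters worked as intended, we can look at the most frequent features remaining in the dfm; topfeatures() comes with quanteda.

topfeatures(toks_dfm, 20) #the 20 most frequent features after cleaning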

Create a dfm for bigrams

#remove stop words first, then form bigrams, so pairs like "of_the" do not dominate
ngram <- tokens_remove(tokens(group_corpus, remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE), stopwords("en"))
ngram <- tokens_ngrams(ngram, n = 2)
bigram_dfm <- dfm(ngram)
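
As with the unigram dfm, a quick look at the top features helps verify the result.

topfeatures(bigram_dfm, 10) #the 10 most frequent bigrams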

Plot word frequency

freq <- textstat_frequency(toks_dfm, n = 25) #frequency table of the 25 most frequent features
toks_dfm %>% 
  textstat_frequency(n = 15) %>% 
  ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()

freq <- textstat_frequency(bigram_dfm, n = 25) #frequency table of the 25 most frequent bigrams
bigram_dfm %>% 
  textstat_frequency(n = 15) %>% 
  ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  labs(x = NULL, y = "Frequency") +
  theme_minimal()
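
The plots above render to the active graphics device; if you want to keep them, ggsave() from ggplot2 writes the most recently drawn plot to disk (the file name below is just an example).

ggsave("bigram_frequency.png", width = 6, height = 4, dpi = 300) #save the last ggplot; file name is arbitrary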

Produce a word cloud.

#wordcloud
set.seed(100)
textplot_wordcloud(toks_dfm)
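
If the default cloud looks too crowded, recent versions of quanteda's textplot_wordcloud() let you cap the number of words and set a minimum count; the values below are illustrative.

set.seed(100)
textplot_wordcloud(toks_dfm, max_words = 100, min_count = 10, color = "darkblue") #a smaller, easier-to-read cloud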

Create a semantic network based on word co-occurrence in tweets

new_dfm <- dfm_trim(toks_dfm, min_termfreq = 10) #create a new dfm keeping only words that appear at least 10 times in the corpus.

new_fcm <- fcm(new_dfm) #create a feature co-occurrence matrix
feat <- names(topfeatures(new_fcm, 100))
new_fcm <- fcm_select(new_fcm, feat) 
size <- log(colSums(dfm_select(new_dfm, feat)))
textplot_network(new_fcm, min_freq = 0.3, vertex_size = size / max(size) * 1, edge_size = 1)

Create a semantic network based on hashtag co-occurrence in tweets

#create a new corpus and dfm containing only strings that begin with #
group_corpus <- corpus(nonrt,docid_field = "tweet_id",text_field = "content")
hashtag_dfm <- dfm(group_corpus, remove_punct = TRUE)
ss_dfm <- dfm_select(hashtag_dfm, c('#*'))

ss_fcm <- fcm(ss_dfm)
toptag <- names(topfeatures(ss_fcm, 100)) 
topgat_fcm <- fcm_select(ss_fcm, toptag)
size <- log(colSums(dfm_select(ss_dfm, toptag))) #recompute vertex sizes from hashtag frequencies
textplot_network(topgat_fcm, min_freq = 0.1, edge_alpha = 0.8, vertex_size = size / max(size) * 1, edge_size = 1)

Create a semantic network based on hashtag co-occurrence in Twitter bios

#create a new corpus and dfm containing only strings that begin with # in Twitter bios
users <- nonrt[!duplicated(nonrt$from_user_id),]
group_corpus <- corpus(users,docid_field = "from_user_id",text_field = "from_user_description")
hashtag_dfm <- dfm(group_corpus, remove_punct = TRUE)
ss_dfm <- dfm_select(hashtag_dfm, c('#*'))
ss_fcm <- fcm(ss_dfm)
ss_dfm
## Document-feature matrix of: 57,771 documents, 9,388 features (100% sparse).
head(ss_fcm)
## Feature co-occurrence matrix of: 6 by 6 features.
## 6 x 6 sparse Matrix of class "fcm"
##                 features
## features         #josephmorgan #billskarsgard #bellarke #maxiosa #reylo
##   #josephmorgan              0              1         0        0      0
##   #billskarsgard             0              0         0        0      0
##   #bellarke                  0              0         0        1      1
##   #maxiosa                   0              0         0        0      1
##   #reylo                     0              0         0        0      0
##   #11amy                     0              0         0        0      0
##                 features
## features         #11amy
##   #josephmorgan       0
##   #billskarsgard      0
##   #bellarke           1
##   #maxiosa            1
##   #reylo              1
##   #11amy              0
toptag <- names(topfeatures(ss_fcm, 100)) 
topgat_fcm <- fcm_select(ss_fcm, toptag)
size <- log(colSums(dfm_select(ss_dfm, toptag))) #recompute vertex sizes from hashtag frequencies in bios
textplot_network(topgat_fcm, min_freq = 0.1, edge_alpha = 0.8, vertex_size = size / max(size) * 1, edge_size = 1)

Create a socio-semantic network based on hashtag and screenname co-occurrence in tweets

#create corpus; note that tweets are stored in the column named "content" 
group_corpus <- corpus(nonrt,docid_field = "tweet_id",text_field = "content")

#tokenization: remove punctuation, numbers, URLs, stop words, and words shorter than three characters (this time we keep hashtags and @mentions). 
toks <- tokens(group_corpus,remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE)
toks <- tokens_select(toks, stopwords('en'), selection = 'remove') 
toks <- tokens_select(toks, min_nchar=3, selection = 'keep')
toks_dfm <- dfm(toks)

ss_dfm <- dfm_select(toks_dfm, c('#*','@*'))
ss_fcm <- fcm(ss_dfm)
toptag <- names(topfeatures(ss_fcm, 100)) 
topgat_fcm <- fcm_select(ss_fcm, toptag)
size <- log(colSums(dfm_select(ss_dfm, toptag))) #recompute vertex sizes for the selected hashtags and screen names
textplot_network(topgat_fcm, min_freq = 0.1, edge_alpha = 0.8, vertex_size = size / max(size) * 1, edge_size = 1)

Export the network data to a graphml file; we can then use Gephi to visualize and analyze the graphml file.

gr <- graph_from_adjacency_matrix(topgat_fcm, weighted = TRUE, diag = FALSE, add.colnames="features")
write.graph(gr,"stranger_things_semantic_userbio.graphml",format="graphml")
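
Before opening the file in Gephi, we can sanity-check the exported network in igraph itself; the snippet below only summarizes the graph and lists the best-connected labels, which the export call above stored under the "features" vertex attribute.

summary(gr) #prints the number of vertices and edges
V(gr)$features[order(degree(gr), decreasing = TRUE)][1:10] #ten best-connected hashtags/screen names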