Code walkthrough for the guest lecture offered at the Isenberg School of Management on 12/04/2018.
The following libraries are required.
library(data.table) #fast data manipulation
library(dplyr) #data manipulation
library(dbplyr) #dplyr backend for working with databases
library(RSQLite) #importing data from a SQLite database
library(quanteda) #a library for quantitative textual analysis
library(ggplot2) #for plotting data
library(igraph) #for network analysis
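If any of these packages are missing, they can be installed from CRAN first (a one-time step):
install.packages(c("data.table", "dplyr", "dbplyr", "RSQLite", "quanteda", "ggplot2", "igraph")) #install any missing packages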
The Stranger Things 2 tweets are stored in the cloud; here I use the following code to import them from an SQLite database.
db <- src_sqlite("/home/weiaiwaynexu/Dropbox/Acer Laptop Sync/Data Science/Stranger Things MIning/Stranger_Things_Mining.sqlite", create = FALSE)
df <- tbl(db, sql("SELECT * FROM tweets")) #the table in the SQLite database is named "tweets"
df <- collect(df) #load the data into memory
df <- df[!duplicated(df$content),] #remove duplicates
unique(df$query) #the search queries used to collect the tweets
## [1] "strangerthings" "stranger_things" "strangerthings2"
Separate the tweet data (the dataframe named df) into two parts: one containing retweets (named rt), and another containing the non-retweets, i.e., original tweets, mentions, and replies (named nonrt).
rt <- df[df$retweeted_status=="THIS IS A RETWEET --> DOUBLE-CHECK JSON",]
nonrt <- df[df$retweeted_status!="THIS IS A RETWEET --> DOUBLE-CHECK JSON",]
#remove non-English tweets
nonrt <- nonrt[nonrt$language=="en",]
print(c("# of retweets:",nrow(rt)))
## [1] "# of retweets:" "21475"
print(c("# of non-retweets:",nrow(nonrt)))
## [1] "# of non-retweets:" "68072"
remove(df) #remove the original df dataframe to free up memory.
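As a quick check, roughly a quarter of the collected tweets are retweets; the share can be computed directly from the two dataframes:
nrow(rt) / (nrow(rt) + nrow(nonrt)) #proportion of retweets in the full sample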
Create a corpus and a document-feature matrix (dfm), quanteda's version of a document-term matrix.
#create corpus; note that tweets are stored in the column named "content"
group_corpus <- corpus(nonrt,docid_field = "tweet_id",text_field = "content")
#tokenize; remove punctuation, numbers, URLs, stop words, user-defined filter words, and words shorter than three characters.
toks <- tokens(group_corpus,remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE)
toks <- tokens_select(toks, stopwords('en'), selection = 'remove')
toks <- tokens_select(toks, c("rt","stranger","#strangerthings","#strangerthings2","#stranger_things","things","mt","@*"), selection = 'remove')
toks <- tokens_select(toks, min_nchar=3, selection = 'keep')
#create document-feature matrix
toks_dfm <- dfm(toks)
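As a sanity check, topfeatures() lists the most frequent terms in the new dfm (a minimal example):
topfeatures(toks_dfm, 10) #ten most frequent words after cleaning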
Create a dfm of bigrams
#tokenize and remove stop words first, then form bigrams with tokens_ngrams()
ngram <- tokens(group_corpus, remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE)
ngram <- tokens_remove(ngram, stopwords("english"))
ngram <- tokens_ngrams(ngram, n = 2)
bigram_dfm <- dfm(ngram)
Plot word frequency
freq <- textstat_frequency(toks_dfm, n = 25)
toks_dfm %>%
textstat_frequency(n = 15) %>%
ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
geom_point() +
coord_flip() +
labs(x = NULL, y = "Frequency") +
theme_minimal()
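The frequency plot can be saved to disk with ggsave(), which writes out the most recently displayed ggplot; the file name below is just an example:
ggsave("word_frequency.png", width = 6, height = 4) #save the most recent ggplot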
freq <- textstat_frequency(bigram_dfm, n = 25)
bigram_dfm %>%
textstat_frequency(n = 15) %>%
ggplot(aes(x = reorder(feature, frequency), y = frequency)) +
geom_point() +
coord_flip() +
labs(x = NULL, y = "Frequency") +
theme_minimal()
Produce a word cloud.
#wordcloud
set.seed(100)
textplot_wordcloud(toks_dfm)
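The word cloud can also be tuned; for instance, restricting it to frequent words keeps the plot readable (max_words and min_count are available in recent quanteda versions; a minimal sketch):
textplot_wordcloud(toks_dfm, max_words = 100, min_count = 10) #show only the 100 most frequent words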
Create a semantic network based on word co-occurrence in tweets
new_dfm <- dfm_trim(toks_dfm, min_termfreq = 10) #create a new dfm to include words that have appeared at least 10 times in the corpus.
new_fcm <- fcm(new_dfm) #create a feature co-occurrence matrix
feat <- names(topfeatures(new_fcm, 100)) #keep the 100 most frequent features
new_fcm <- fcm_select(new_fcm, feat) #restrict the co-occurrence matrix to those features
size <- log(colSums(dfm_select(new_dfm, feat))) #node size proportional to (log) word frequency
textplot_network(new_fcm, min_freq = 0.3, vertex_size = size / max(size) * 1, edge_size = 1)
Create a semantic network based on hashtag co-occurrence in tweets
#create a new corpus and dfm containing only strings that begin with #
group_corpus <- corpus(nonrt,docid_field = "tweet_id",text_field = "content")
hashtag_dfm <- dfm(group_corpus, remove_punct = TRUE)
ss_dfm <- dfm_select(hashtag_dfm, c('#*'))
ss_fcm <- fcm(ss_dfm)
toptag <- names(topfeatures(ss_fcm, 100))
topgat_fcm <- fcm_select(ss_fcm, toptag)
size <- log(colSums(dfm_select(ss_dfm, toptag))) #recompute node sizes for the hashtag features
textplot_network(topgat_fcm, min_freq = 0.1, edge_alpha = 0.8, vertex_size = size / max(size) * 1, edge_size = 1)
Create a semantic network based on hashtag co-occurrence in Twitter bios
#create a new corpus and dfm containing only strings that begin with # in Twitter bios
users <- nonrt[!duplicated(nonrt$from_user_id),] #keep one row per unique user
group_corpus <- corpus(users,docid_field = "from_user_id",text_field = "from_user_description")
hashtag_dfm <- dfm(group_corpus, remove_punct = TRUE)
ss_dfm <- dfm_select(hashtag_dfm, c('#*'))
ss_fcm <- fcm(ss_dfm)
ss_dfm
## Document-feature matrix of: 57,771 documents, 9,388 features (100% sparse).
head(ss_fcm)
## Feature co-occurrence matrix of: 6 by 6 features.
## 6 x 6 sparse Matrix of class "fcm"
## features
## features #josephmorgan #billskarsgard #bellarke #maxiosa #reylo
## #josephmorgan 0 1 0 0 0
## #billskarsgard 0 0 0 0 0
## #bellarke 0 0 0 1 1
## #maxiosa 0 0 0 0 1
## #reylo 0 0 0 0 0
## #11amy 0 0 0 0 0
## features
## features #11amy
## #josephmorgan 0
## #billskarsgard 0
## #bellarke 1
## #maxiosa 1
## #reylo 1
## #11amy 0
toptag <- names(topfeatures(ss_fcm, 100))
topgat_fcm <- fcm_select(ss_fcm, toptag)
size <- log(colSums(dfm_select(ss_dfm, toptag))) #recompute node sizes for the bio hashtag features
textplot_network(topgat_fcm, min_freq = 0.1, edge_alpha = 0.8, vertex_size = size / max(size) * 1, edge_size = 1)
Create a socio-semantic network based on hashtag and screen name co-occurrence in tweets
#create corpus; note that tweets are stored in the column named "content"
group_corpus <- corpus(nonrt,docid_field = "tweet_id",text_field = "content")
#tokenize; remove punctuation, numbers, URLs, stop words, and words shorter than three characters.
toks <- tokens(group_corpus,remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE)
toks <- tokens_select(toks, stopwords('en'), selection = 'remove')
toks <- tokens_select(toks, min_nchar=3, selection = 'keep')
toks_dfm <- dfm(toks)
ss_dfm <- dfm_select(toks_dfm, c('#*','@*'))
ss_fcm <- fcm(ss_dfm)
toptag <- names(topfeatures(ss_fcm, 100))
topgat_fcm <- fcm_select(ss_fcm, toptag)
size <- log(colSums(dfm_select(ss_dfm, toptag))) #recompute node sizes for the hashtag and screen name features
textplot_network(topgat_fcm, min_freq = 0.1, edge_alpha = 0.8, vertex_size = size / max(size) * 1, edge_size = 1)
Export the network data to a GraphML file, which we can then open in Gephi for visualization and analysis.
gr <- graph_from_adjacency_matrix(topgat_fcm, weighted = TRUE, diag = FALSE, add.colnames="features")
write.graph(gr,"stranger_things_semantic_userbio.graphml",format="graphml")
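Basic network statistics can also be computed directly in igraph before switching to Gephi; a minimal sketch using weighted degree (node labels are stored in the "features" vertex attribute created above):
node_strength <- strength(gr, weights = E(gr)$weight) #weighted degree of each node
names(node_strength) <- V(gr)$features #attach the hashtag/screen name labels
head(sort(node_strength, decreasing = TRUE), 10) #top ten terms by co-occurrence strength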