Here we use R with the tidytext and tidyverse packages to start our analysis. To begin, read in the CSV file of the report's text, which was preprocessed via the R code found here. After preprocessing, each row holds one line of text from the report, along with its page and line number.
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.5.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.8
## v tidyr 0.8.2 v stringr 1.4.0
## v readr 1.2.1 v forcats 0.3.0
## Warning: package 'stringr' was built under R version 3.5.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.3
## Loading required package: RColorBrewer
library(igraph)
## Warning: package 'igraph' was built under R version 3.5.3
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
muller_report <- read_csv("mueller_report.csv")
## Parsed with column specification:
## cols(
## page = col_double(),
## line = col_double(),
## text = col_character()
## )
Next, load the dataset of common stop words that we’ll exclude from the list of words to be analyzed, and add “trump” and “intelligence” to a custom stop word list that we’ll apply later as needed.
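The exact code for this step isn’t shown above; here is a minimal sketch, assuming the custom list is simply tidytext’s built-in stop_words tibble with the two extra words appended (the lexicon label "custom" is an assumption for illustration).
data(stop_words)  # tidytext's built-in stop word list
custom_stop_words <- bind_rows(
  tibble(word = c("trump", "intelligence"), lexicon = "custom"),  # assumed extra words per the text above
  stop_words
)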
Create the tidy dataframe with unnest_tokens(), which converts a dataframe with a text column into one token per row. Then remove the stop words with an anti_join() and use a regex to drop tokens that are purely numeric.
tidy_muller <- muller_report %>% unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
filter(!str_detect(word, "^\\d+$"))
## Joining, by = "word"
Create a dataframe of the top words and a word cloud of the top 50 words.
top_words <- tidy_muller %>% count(word, sort=TRUE)
top_words %>% with(wordcloud(word, n, max.words=50))
Use the Bing sentiment lexicon to score single words in the report, then graph the words that contribute most to negative and positive sentiment.
bing_word_counts <- tidy_muller %>% inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort=TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts %>%
anti_join(custom_stop_words) %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word=reorder(word, n)) %>%
ggplot(aes(word, n, fill=sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales="free_y") +
labs(y="Contribution to sentiment",
x=NULL) +
coord_flip() + theme_minimal() +
ggtitle("Sentiment Analysis of Words in the Muller Report",
"Basic Single Word Method, using Bing Lexicon")
## Joining, by = "word"
## Selecting by n
Create a bigram graph, starting again from the original dataframe and calling unnest_tokens(), this time with token = "ngrams" and n = 2 to produce bigrams. Separate each bigram into its two words, remove stop words, and drop purely numeric tokens with the same regex. Count the bigrams, then build a graph from the most frequent pairs.
muller_bigrams <- muller_report %>%
unnest_tokens(bigram, text, token="ngrams", n=2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!str_detect(word1, "^\\d+$")) %>%
filter(!str_detect(word2, "^\\d+$"))
muller_bigrams_count <- muller_bigrams %>%
count(word1, word2, sort=TRUE)
bigram_graph <- muller_bigrams_count %>%
filter(n > 20) %>%
graph_from_data_frame()
bigram_graph
## IGRAPH f9e24bd DN-- 146 109 --
## + attr: name (v/c), n (e/n)
## + edges from f9e24bd (vertex names):
## [1] u.s ->department trump ->campaign
## [3] special ->counsel ongoing ->matter
## [5] white ->house trump ->jr
## [7] russian ->government trump ->tower
## [9] special ->counsel's personal ->counsel
## [11] trump ->organization candidate ->trump
## [13] counsel's ->office moscow ->project
## [15] foreign ->policy president's->personal
## + ... omitted several edges
library(ggraph)
## Warning: package 'ggraph' was built under R version 3.5.3
a <- grid::arrow(type="closed", length=unit(.15, "inches"))
ggraph(bigram_graph, layout="fr") +
geom_edge_link(aes(edge_alpha=n), show.legend=FALSE,
arrow=a, end_cap=circle(.07, "inches")) +
geom_node_point() +
geom_node_text(aes(label=name), vjust=1, hjust=1) +
theme_void()
Next, let’s look at the trend of word sentiment by page number. Using the same single-word Bing lexicon, count the positive and negative words on each page (filling in zero where a sentiment doesn’t appear), take the difference as that page’s sentiment score, and color each bar by that score.
muller_sentiment <- tidy_muller %>%
anti_join(custom_stop_words) %>%
inner_join(get_sentiments("bing")) %>%
count(page,index = line %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
ggplot(muller_sentiment, aes(page, sentiment, fill=sentiment)) +
geom_col(show.legend = FALSE) +
xlab("Page Number") +
ylab("Sentiment Score") +
ggtitle("Per Page Sentiment of Muller Report",
"Single word sentiment, Bing lexicon") +
theme_minimal()
Finally, look at the tf-idf of each word to see which words are most distinctive to each part of the report. We’ll break the analysis down by chapter within each volume, using a regex on the Roman-numeral headings to find the chapter breaks.
muller_volume_1 <- muller_report %>% filter(page >= 19) %>% filter(page < 208)
muller_volume_2 <- muller_report %>% filter(page >= 221) %>% filter(page < 395)
muller_v1_tfidf <- muller_volume_1 %>%
filter(!is.na(text)) %>%
mutate(chapter=cumsum(str_detect(text,regex("^[IVX]+\\.", ignore_case=FALSE)))) %>%
unnest_tokens(word, text) %>%
filter(!str_detect(word, "^\\d+$")) %>%
count(chapter, word, sort=TRUE) %>%
bind_tf_idf(word, chapter, n)
muller_v1_tfidf %>%
filter(chapter != 0) %>%
group_by(chapter) %>%
top_n(7, tf_idf) %>%
ungroup() %>%
mutate(word=reorder(word, tf_idf)) %>%
ggplot(aes(word, tf_idf)) +
geom_col(show.legend=FALSE, fill = 'steelblue') +
labs(x=NULL, y="tf-idf") +
facet_wrap(~chapter, ncol=3, scales="free") +
coord_flip() +
ggtitle("7 Highest tf-idf words in each section of Volume 1 of Muller's Report",
"Partitioned by Chapter") + theme_minimal()
muller_v2_tfidf <- muller_volume_2 %>%
filter(!is.na(text)) %>%
mutate(chapter=cumsum(str_detect(text,regex("^[IVX]+\\.", ignore_case=FALSE)))) %>%
unnest_tokens(word, text) %>%
filter(!str_detect(word, "^\\d+$")) %>%
count(chapter, word, sort=TRUE) %>%
bind_tf_idf(word, chapter, n)
muller_v2_tfidf %>%
filter(chapter != 0) %>%
group_by(chapter) %>%
top_n(7, tf_idf) %>%
ungroup() %>%
mutate(word=reorder(word, tf_idf)) %>%
ggplot(aes(word, tf_idf)) +
geom_col(show.legend=FALSE, fill = 'steelblue') +
labs(x=NULL, y="tf-idf") +
facet_wrap(~chapter, ncol=3, scales="free") +
coord_flip() +
ggtitle("7 Highest tf-idf words in each section of Volume 2 of Muller's Report",
"Partitioned by Section") + theme_minimal()
This should get you started investigating the Mueller Report for yourself. If you’re looking for ideas for further analysis, consider questions that are on the public’s mind. For example, you could analytically summarize the sentiment of the report’s authors toward a specific topic, such as President Trump’s involvement in the acknowledged Russian campaign interference, which is the focus of Volume 2. Perhaps data science can uncover insights into the leaning of the report’s authors on this hot topic.
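As one concrete starting point, here is a minimal, hypothetical sketch of such a topic-focused sentiment summary, built on the objects created above; the topic term "russia" and the column name net_sentiment are illustrative assumptions, not part of the analysis in this post.
topic_sentiment <- muller_report %>%
  filter(str_detect(str_to_lower(text), "russia")) %>%   # keep only lines that mention the topic
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(net_sentiment = positive - negative)
topic_sentiment
A strongly negative net_sentiment would suggest the language surrounding the topic skews negative under the Bing lexicon; a positive value suggests the opposite. Happy mining!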