For this exercise I used a FiveThirtyEight dataset: the phrases that GOP candidates repeated the most (https://github.com/fivethirtyeight/data/tree/master/repeated-phrases-gop). I downloaded the dataset from that repository, then used the tidyverse to clean up the tokens, igraph to graph the most common bigrams, and ggplot to plot the Bing-lexicon sentiment.
# Attach every package this analysis relies on so the script runs from a
# fresh session: dplyr (%>%, count, filter, mutate, inner_join), tidyr
# (separate), tidytext (unnest_tokens, stop_words, get_sentiments), igraph
# (graph_from_data_frame), ggraph (network plotting) and ggplot2 (bar chart).
library(dplyr)
library(tidyr)
library(tidytext)
library(igraph)
library(ggraph)
library(ggplot2)

# Read the debate transcript, one line of text per row. The text is kept as
# character (not factor) so it can be tokenised later.
mydata <- read.csv(
  "https://raw.githubusercontent.com/BanuB/DATA607FALL2019REPO/master/gop_debate_all1.csv",
  stringsAsFactors = FALSE
)
# Give the text column a stable name that the rest of the script refers to.
names(mydata) <- "textcol"
# Preview the first ten transcript lines. head() returns fewer elements on a
# short input instead of padding the result with NA the way `[1:10]` does.
head(mydata$textcol, 10)
## [1] "KELLY: Welcome to the first debate night of the 2016 presidential campaign, live from Quicken Loans Arena in Cleveland, Ohio."
## [2] "I'm Megyn Kelly..."
## [3] "(APPLAUSE)"
## [4] "... along with my co-moderators, Brett Baier and Chris Wallace."
## [5] "Tonight..."
## [6] "(APPLAUSE)"
## [7] "Nice."
## [8] "Tonight, thousands of people here in the Q, along with millions of voters at home will get their very first chance to see the candidates face off in a debate, answering the questions you want answered."
## [9] "BAIER: Less than a year from now, in this very arena, one of these 10 candidates or one of the seven on the previous debate tonight will accept the Republican party's nomination."
## [10] "(APPLAUSE)"
# Inspect the structure of the raw transcript data frame.
str(mydata)
## 'data.frame': 10007 obs. of 1 variable:
## $ textcol: chr "KELLY: Welcome to the first debate night of the 2016 presidential campaign, live from Quicken Loans Arena in Cleveland, Ohio." "I'm Megyn Kelly..." "(APPLAUSE)" "... along with my co-moderators, Brett Baier and Chris Wallace." ...
# Work on (at most) the first 10,000 transcript lines. The column is named at
# construction time and explicitly kept as character, so the result does not
# depend on the R version's stringsAsFactors default; head() avoids the NA
# rows that `[1:10000]` would append if the file were shorter.
new_data <- data.frame(
  textcol = head(mydata$textcol, 10000),
  stringsAsFactors = FALSE
)
# Tokenise the text into bigrams (pairs of consecutive words), one per row.
snippet_bigrams <- new_data %>%
  unnest_tokens(bigram, textcol, token = "ngrams", n = 2)
# Total number of bigram rows produced.
nrow(snippet_bigrams)
## [1] 251378
# Tabulate how often each distinct bigram occurs, most frequent first.
count(snippet_bigrams, bigram, sort = TRUE)
## # A tibble: 88,252 x 2
## bigram n
## <chr> <int>
## 1 of the 1120
## 2 going to 940
## 3 in the 910
## 4 to be 688
## 5 we have 657
## 6 want to 604
## 7 and i 548
## 8 you know 542
## 9 thank you 532
## 10 we need 507
## # ... with 88,242 more rows
# Split every bigram into its first and second word.
bigrams_separated <- snippet_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
# Keep only bigrams in which both words are present and neither is a stop
# word. The explicit NA check matters because, depending on the tidytext
# version, a line with fewer than two words yields an NA bigram; NA passes
# the `%in%` test below and would otherwise surface as an NA-NA pair in the
# counts and in any graph built from them.
bigrams_filtered <- bigrams_separated %>%
  filter(!is.na(word1), !is.na(word2)) %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)
bigram_counts
## # A tibble: 15,646 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 hillary clinton 226
## 2 senator rubio 206
## 3 senator cruz 195
## 4 governor kasich 133
## 5 dr carson 132
## 6 governor bush 131
## 7 barack obama 128
## 8 donald trump 109
## 9 american people 102
## 10 governor christie 87
## # ... with 15,636 more rows
# Restrict the edge list to word pairs seen more than 25 times, then turn
# that (word1, word2, n) table into an igraph graph and print its summary.
bigram_graph <- graph_from_data_frame(filter(bigram_counts, n > 25))
bigram_graph
## IGRAPH 83d5555 DN-- 109 77 --
## + attr: name (v/c), n (e/n)
## + edges from 83d5555 (vertex names):
## [1] hillary ->clinton senator ->rubio senator ->cruz
## [4] governor ->kasich dr ->carson governor ->bush
## [7] barack ->obama donald ->trump american ->people
## [10] governor ->christie senator ->paul social ->security
## [13] foreign ->policy supreme ->court middle ->east
## [16] north ->korea president ->obama ronald ->reagan
## [19] federal ->government applause ->blitzer trump ->trump
## [22] applause ->baier commercial->break applause ->trump
## + ... omitted several edges
# Seed the RNG so that any random component of the drawing is repeatable,
# then plot the bigram network: edges as straight links, words as points,
# each point labelled with its word.
set.seed(2017)
ggraph(graph = bigram_graph, layout = "kk") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(mapping = aes(label = name), hjust = 1, vjust = 1)
# Check the column type of the subset before tokenising. The recorded output
# below comes from a session in which textcol was stored as a factor.
str(new_data)
## 'data.frame': 10000 obs. of 1 variable:
## $ textcol: Factor w/ 7892 levels "' QUICK: Where did I read this and come up with this that you were...",..: 4294 3408 3 96 6395 3 4553 6393 969 3 ...
# Tokenise into single words (n-grams of length 1), one word per row.
# Despite its name, token_dtm is a plain one-column table of tokens.
token_dtm <- unnest_tokens(new_data, word, textcol, token = "ngrams", n = 1)
str(token_dtm)
## 'data.frame': 251379 obs. of 1 variable:
## $ word: chr "kelly" "welcome" "to" "the" ...
# Discard every token that appears in tidytext's stop-word list.
unigram_filtered2 <- filter(token_dtm, !(word %in% stop_words$word))
str(unigram_filtered2)
## 'data.frame': 85303 obs. of 1 variable:
## $ word: chr "kelly" "debate" "night" "2016" ...
# Peek at the first six remaining words.
head(unigram_filtered2, n = 6)
## word
## 1 kelly
## 2 debate
## 3 night
## 4 2016
## 5 presidential
## 6 campaign
#unigram_filtered2 %>% count(word,sort = TRUE)
# Bar chart of the words that contribute most to sentiment under the Bing
# lexicon: words matched at least 30 times, negative words plotted below
# zero, bars ordered by signed count.
unigram_filtered2 %>%
  # In this corpus "trump" is a candidate's surname (and speaker label), but
  # the Bing lexicon scores the verb "trump" as positive; exclude it so a
  # proper name is not counted as positive sentiment.
  filter(word != "trump") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(sentiment, word) %>%
  filter(n >= 30) %>%
  # Flip the sign of negative counts so they extend downwards from zero.
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  # geom_col() draws bars whose height is the value of n itself.
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment") +
  ggtitle("Bing Lexicon Sentiment Analysis for corpus")