Rerunning my word clouds now that I've figured out the data cleaning somewhat.
Load in libraries:
#load in initial necessary libraries
library(quanteda)
library(readr)
library(dplyr)
library(stringr)
library(tidytext)
library(tm)
library(lexicon)
library(wordcloud)
library(textstem)
library(tidyverse)
library(purrr)
library(rvest)
library(curl)
library(httr)
library(text2vec)
library(LDAvis)
library(caret)
library(randomForest)
library(caTools)
library(ldatuning)
Load in data from CSV
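The load chunk itself isn't shown here. A minimal sketch of this step, with hypothetical file names (the notebook uses both authors_abstracts and authors_ethics later on):
# sketch only -- the real CSV paths aren't shown in this notebook
authors_abstracts <- read_csv("authors_abstracts.csv")
authors_ethics <- read_csv("authors_ethics.csv")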
Explore Common Words
abstractWords <- authors_abstracts %>%
dplyr::select(Abstract) %>%
unnest_tokens(word, Abstract)
head(abstractWords)
# A tibble: 6 × 1
word
<chr>
1 why
2 are
3 politicians
4 more
5 likely
6 to
plot the top 30 words:
# plot the top 30 words
abstractWords %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL,
       y = "Count",
       title = "Count of unique words found in Abstracts")
Deal with Stop Words
# how many words before removing stop words?
nrow(abstractWords)
[1] 18378
abstractClean <- abstractWords %>%
anti_join(stop_words)
# how many words after removing the stop words?
nrow(abstractClean)
[1] 9986
Replot top 30
# plot the top 30 words -- notice any issues?
abstractClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL,
       y = "Count",
       title = "Count of unique words found in Abstracts (cleaned)")
Put top 40 into a dataframe
# put the top 40 words into their own dataframe
top40 <- abstractClean %>%
dplyr::count(word, sort = TRUE) %>%
top_n(40) %>%
mutate(word = reorder(word, n))
head(top40)
# A tibble: 6 × 2
word n
<fct> <int>
1 experiment 125
2 voters 111
3 field 110
4 political 109
5 information 81
6 social 71
Finally, word cloud!!
library(wordcloud2)
set.seed(200)
wordcloud2(data=top40, color = "random-dark", size = .75)
For some reason, not all of the terms appear when I export it as a PDF. I don't know why.
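This may be because wordcloud2 is an htmlwidget and silently skips any word that doesn't fit the canvas at the rendered size. One workaround sketch, assuming the htmlwidgets and webshot packages are installed (webshot::install_phantomjs() may be needed once), is to save the widget as HTML and screenshot it; the file names here are placeholders:
library(htmlwidgets)
library(webshot)
# render the cloud to a standalone HTML file, then capture it as a PDF
cloud <- wordcloud2(data = top40, color = "random-dark", size = .75)
saveWidget(cloud, "top40cloud.html", selfcontained = FALSE)
webshot("top40cloud.html", "top40cloud.pdf", delay = 5)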
Explore Common Words
ethicWords <- authors_abstracts %>%
dplyr::select(Paragraph) %>%
unnest_tokens(word, Paragraph) %>%
filter(!grepl('[0-9]', word))
ethicWords <- ethicWords[complete.cases(ethicWords), ]
#url_regex <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
#ethicWords <- str_remove_all(ethicWords, url_regex)
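Since unnest_tokens has already split any URLs into fragments, a token-level filter is one way to go after the leftovers (a sketch, not applied here):
# sketch: drop URL-ish fragments left over after tokenization
ethicWords %>% filter(!str_detect(word, "^http|^www"))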
head(ethicWords)
# A tibble: 6 × 1
word
<chr>
1 ethical
2 considerations
3 though
4 the
5 human
6 subjects
plot the top 30 words:
# plot the top 30 words
ethicWords %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL,
       y = "Count",
       title = "Count of unique words found in Ethic* Paragraphs")
Deal with Stop Words
# how many words before removing stop words?
nrow(ethicWords)
[1] 8097
ethicClean <- ethicWords %>%
  anti_join(stop_words) %>%
  # drop any tokens that still contain digits
  filter(!str_detect(word, "[[:digit:]]"))
# how many words after removing the stop words?
nrow(ethicClean)
[1] 4068
Replot top 30
# plot the top 30 words -- notice any issues?
ethicClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL,
       y = "Count",
       title = "Count of unique words found in Ethic* Paragraphs (cleaned)")
Put top 40 into a dataframe
# put the top 40 words into their own dataframe
top40ethic <- ethicClean %>%
dplyr::count(word, sort = TRUE) %>%
top_n(40) %>%
mutate(word = reorder(word, n))
head(top40ethic)
# A tibble: 6 × 2
word n
<fct> <int>
1 ethical 63
2 research 57
3 study 36
4 experiment 32
5 political 32
6 ethics 30
Finally, word cloud!!
library(wordcloud2)
set.seed(200)
wordcloud2(data=top40ethic, color = "random-dark", size = .75)
Now I am going to drill down to just the substantive paragraphs and see what differences, if any, there are from the overall ethics words.
Explore Common Words
ethicWordsSub <- authors_ethics %>%
  dplyr::filter(TypeMention == "substantive") %>%
  dplyr::select(Paragraph) %>%
  unnest_tokens(word, Paragraph) %>%
  filter(!grepl('[0-9]', word))
#ethicWordsS <- ethicWordsS[complete.cases(ethicWordsS), ]
#url_regex <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
#ethicWords <- str_remove_all(ethicWords, url_regex)
head(ethicWordsSub)
# A tibble: 6 × 1
word
<chr>
1 ethical
2 considerations
3 though
4 the
5 human
6 subjects
plot the top 30 words:
# plot the top 30 words
ethicWordsSub %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL,
       y = "Count",
       title = "Count of unique words found in Ethic* Paragraphs (substantive mentions only)")
Deal with Stop Words
# how many words before removing stop words?
nrow(ethicWordsSub)
[1] 5798
ethicCleanSub <- ethicWordsSub %>%
  anti_join(stop_words) %>%
  # drop any tokens that still contain digits
  filter(!str_detect(word, "[[:digit:]]"))
# how many words after removing the stop words?
nrow(ethicCleanSub)
[1] 2764
Replot top 30
ethicCleanSub %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL,
       y = "Count",
       title = "Count of unique words found in Ethic* Substantive Paragraphs (cleaned)")
Put top 40 into a dataframe
# put the top 40 words into their own dataframe
top40ethicSub <- ethicCleanSub %>%
dplyr::count(word, sort = TRUE) %>%
top_n(40) %>%
mutate(word = reorder(word, n))
head(top40ethicSub)
# A tibble: 6 × 2
word n
<fct> <int>
1 ethical 45
2 research 33
3 experiment 28
4 field 24
5 study 24
6 experiments 23
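Note that "experiment" and "experiments" are counted separately here. textstem is already loaded above, so one option (a sketch, not what was run for the cloud below) is to lemmatize before counting:
# sketch: collapse plural/singular variants before counting
ethicCleanSub %>%
  mutate(word = lemmatize_words(word)) %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(40)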
Finally, word cloud!!
set.seed(200)
wordcloud2(data=top40ethicSub, color = "random-dark", size = .75)
library(VennDiagram)
ethic40 <- top40ethic %>% select(word)
ethic40sub <- top40ethicSub %>% select(word)
ethicVector <- as.vector(unlist(ethic40))
substantVector <- as.vector(unlist(ethic40sub))
venn.diagram(
  x = list(ethicVector, substantVector),
  category.names = c("All Ethic* Paragraphs", "Substantive Mentions"),
  col = c("#440154ff", "#21908dff"),
  fill = c(alpha("#440154ff", 0.3), alpha("#21908dff", 0.3)),
  filename = "#ethicsVenn7.png",
  output = TRUE,
  cat.dist = c(-0.125, -0.125),
  cat.pos = c(-90, 90)
)
[1] 1
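The [1] 1 above is just venn.diagram()'s return value when it writes the image to disk. It also drops a .log file next to the image; assuming futile.logger (a VennDiagram dependency), this should silence it:
# optional: suppress the .log file VennDiagram writes alongside the image
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger")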