Rerun of Word Clouds - May 12, 2022

Rerunning my word clouds now that I’ve somewhat figured out the data cleaning.

Lissie Bates-Haus, Ph.D. (https://github.com/lbateshaus), UMass Amherst DACSS MS student (https://www.umass.edu/sbs/data-analytics-and-computational-social-science-program/ms)
2022-05-12

Abstracts Word Cloud

Load in libraries:
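
Judging from the functions called below, the setup chunk was presumably along these lines:

library(dplyr)    # select(), filter(), count(), pipes
library(tidytext) # unnest_tokens(), stop_words
library(ggplot2)  # bar charts of the word counts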

Load in data from CSV:

# set the working directory and read in the data
setwd("~/DACCS R/Text as Data/Final Project TaD R")
library(readr)

authors_abstracts <- read_csv("authors_abstracts.csv", show_col_types = FALSE)

Explore Common Words

# tokenize the abstracts into one word per row
abstractWords <- authors_abstracts %>%
  dplyr::select(Abstract) %>%
  unnest_tokens(word, Abstract)

head(abstractWords)
# A tibble: 6 × 1
  word       
  <chr>      
1 why        
2 are        
3 politicians
4 more       
5 likely     
6 to         

Plot the top 30 words:

# plot the top 30 words
abstractWords %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in Abstracts")

Deal with Stop Words

data("stop_words")
# how many words do you have including the stop words?
nrow(abstractWords)
[1] 18378
abstractClean <- abstractWords %>%
  anti_join(stop_words)

# how many words after removing the stop words?
nrow(abstractClean)
[1] 9986
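
If domain words that survive the standard stop list ever swamp the cloud, one option would be to append a custom stop list before the anti_join. A minimal sketch, where the words and the name are purely illustrative:

# hypothetical custom stop list
myStopWords <- tibble(word = c("study", "studies"), lexicon = "custom")

abstractCleanCustom <- abstractWords %>%
  anti_join(bind_rows(stop_words, myStopWords))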

Replot top 30

# plot the top 30 words -- notice any issues?
abstractClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in Abstracts (cleaned)")

Put the top 40 into a dataframe

# put the top 40 words into their own dataframe

top40 <- abstractClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(40) %>%
  mutate(word = reorder(word, n))

head(top40)
# A tibble: 6 × 2
  word            n
  <fct>       <int>
1 experiment    125
2 voters        111
3 field         110
4 political     109
5 information    81
6 social         71
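
One caveat: top_n() keeps ties, so the “top 40” can hold more than 40 rows, and newer dplyr supersedes it with slice_max(). A tie-free equivalent would be:

# with_ties = FALSE caps the result at exactly 40 rows
top40 <- abstractClean %>%
  dplyr::count(word, sort = TRUE) %>%
  slice_max(n, n = 40, with_ties = FALSE) %>%
  mutate(word = reorder(word, n))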

Finally, word cloud!!

library(wordcloud2)
set.seed(200)
wordcloud2(data=top40, color = "random-dark", size = .75)

For some reason, not all of the terms appear when I export it as a PDF. I don’t know why.
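
wordcloud2 renders an HTML widget rather than a base R plot, which is likely why static export drops some terms. A common workaround (assuming the htmlwidgets and webshot packages are installed, plus PhantomJS via webshot::install_phantomjs()) is to save the widget and screenshot it; the file names here are illustrative:

# save the widget as HTML, then render it to a static image
wc <- wordcloud2(data = top40, color = "random-dark", size = .75)
htmlwidgets::saveWidget(wc, "abstract_cloud.html", selfcontained = FALSE)
webshot::webshot("abstract_cloud.html", "abstract_cloud.png", delay = 5)  # delay lets the JavaScript finish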

Ethic* Word Cloud

Explore Common Words

# tokenize the ethic* paragraphs and drop any token containing a digit
ethicWords <- authors_abstracts %>%
  dplyr::select(Paragraph) %>%
  unnest_tokens(word, Paragraph) %>%
  filter(!grepl('[0-9]', word))

# keep only complete (non-NA) rows
ethicWords <- ethicWords[complete.cases(ethicWords), ]

#url_regex <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
#ethicWords <- str_remove_all(ethicWords, url_regex)
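
The commented-out attempt above would not quite do what’s wanted: str_remove_all() works on a character vector, and handing it the whole data frame collapses the column into one string. Since unnest_tokens() has already broken any URLs into pieces, a row-wise filter on url-like tokens is probably closer to the intent. A minimal sketch, with a hypothetical name:

# hypothetical fix: drop tokens that look like leftover url fragments
ethicWordsNoUrl <- ethicWords %>%
  filter(!grepl("^(http|https|www)", word))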

head(ethicWords)
# A tibble: 6 × 1
  word          
  <chr>         
1 ethical       
2 considerations
3 though        
4 the           
5 human         
6 subjects      

Plot the top 30 words:

# plot the top 30 words
ethicWords %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in Ethic* Paragraphs")

Deal with Stop Words

data("stop_words")
# how many words do you have including the stop words?
nrow(ethicWords)
[1] 8097
ethicClean <- ethicWords %>%
  anti_join(stop_words) %>%
  filter(!grepl("[[:digit:]]", word))

# how many words after removing the stop words?
nrow(ethicClean)
[1] 4068

Replot top 30

# plot the top 30 words -- notice any issues?
ethicClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in Ethic* Paragraphs (cleaned)")

Put the top 40 into a dataframe

# put the top 40 words into their own dataframe

top40ethic <- ethicClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(40) %>%
  mutate(word = reorder(word, n))

head(top40ethic)
# A tibble: 6 × 2
  word           n
  <fct>      <int>
1 ethical       63
2 research      57
3 study         36
4 experiment    32
5 political     32
6 ethics        30

Finally, word cloud!!

library(wordcloud2)
set.seed(200)
wordcloud2(data=top40ethic, color = "random-dark", size = .75)

Now I am going to drill down to just the substantive paragraphs and see what differences, if any, there are from the overall ethics words.

Ethic* SUBSTANTIVE Word Cloud

Explore Common Words

# read in the coded ethics paragraphs
setwd("~/DACCS R/Text as Data/Final Project TaD R")
library(readr)

authors_ethics <- read_csv("authors_ethics.csv", show_col_types = FALSE)
ethicWordsSub <- authors_ethics %>% dplyr::select(Paragraph, TypeMention)

# keep only paragraphs coded as substantive mentions, then tokenize
ethicWordsSub <- ethicWordsSub %>% dplyr::filter(TypeMention == "substantive") %>%
  dplyr::select(Paragraph) %>%
  unnest_tokens(word, Paragraph) %>%
  filter(!grepl('[0-9]', word))

#ethicWordsSub <- ethicWordsSub[complete.cases(ethicWordsSub), ]

head(ethicWordsSub)
# A tibble: 6 × 1
  word          
  <chr>         
1 ethical       
2 considerations
3 though        
4 the           
5 human         
6 subjects      

Plot the top 30 words:

# plot the top 30 words
ethicWordsSub %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in Ethic* Paragraphs (substantive mentions only)")

Deal with Stop Words

data("stop_words")
# how many words do you have including the stop words?
nrow(ethicWordsSub)
[1] 5798
ethicCleanSub <- ethicWordsSub %>%
  anti_join(stop_words) %>%
  filter(!grepl("[[:digit:]]", word))

# how many words after removing the stop words?
nrow(ethicCleanSub)
[1] 2764

Replot top 30

ethicCleanSub %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in Ethic* Substantive Paragraphs (cleaned)")

Put the top 40 into a dataframe

# put the top 40 words into their own dataframe

top40ethicSub <- ethicCleanSub %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(40) %>%
  mutate(word = reorder(word, n))

head(top40ethicSub)
# A tibble: 6 × 2
  word            n
  <fct>       <int>
1 ethical        45
2 research       33
3 experiment     28
4 field          24
5 study          24
6 experiments    23

Finally, word cloud!!

set.seed(200)
wordcloud2(data=top40ethicSub, color = "random-dark", size = .75)

Compare the two top-40 lists with a Venn diagram:

library(VennDiagram)

ethic40 <- top40ethic %>% dplyr::select(word)
ethic40sub <- top40ethicSub %>% dplyr::select(word)

ethicVector <- as.vector(unlist(ethic40))
substantVector <- as.vector(unlist(ethic40sub))

venn.diagram(
  x = list(ethicVector, substantVector),
  category.names = c("All Ethic* Paragraphs", "Substantive Mentions"),
  col = c("#440154ff", "#21908dff"),
  fill = c(alpha("#440154ff", 0.3), alpha("#21908dff", 0.3)),
  filename = '#ethicsVenn7.png',
  output = TRUE,
  cat.dist = c(-0.125, -0.125),
  cat.pos = c(-90, 90)
)
[1] 1
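
As a quick text check on the same comparison, base R’s set operations on the two vectors built above list the overlap and the differences directly:

# words appearing in both top-40 lists
intersect(ethicVector, substantVector)
# words only in the all-paragraphs top 40
setdiff(ethicVector, substantVector)
# words only in the substantive top 40
setdiff(substantVector, ethicVector)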