Rerun of Word Clouds - May 12, 2022

Rerunning my word clouds now that I’ve somewhat figured out the data cleaning.

Lissie Bates-Haus, Ph.D. (https://github.com/lbateshaus), UMass Amherst DACSS MS student (https://www.umass.edu/sbs/data-analytics-and-computational-social-science-program/ms)
2022-05-12

Abstracts Word Cloud

Load in libraries:
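
Judging from the functions called below, the setup chunk was presumably along these lines:

library(dplyr)    # select(), filter(), count(), pipes
library(tidytext) # unnest_tokens(), stop_words
library(ggplot2)  # bar charts of the word counts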

Load in data from CSV:

# set the working directory and read in the data
setwd("~/DACCS R/Text as Data/Final Project TaD R")
library(readr)

authors_abstracts <- read_csv("authors_abstracts.csv", show_col_types = FALSE)

Explore Common Words

# tokenize the abstracts into one word per row
abstractWords <- authors_abstracts %>%
  dplyr::select(Abstract) %>%
  unnest_tokens(word, Abstract)

head(abstractWords)
# A tibble: 6 × 1
  word       
  <chr>      
1 why        
2 are        
3 politicians
4 more       
5 likely     
6 to         

Plot the top 30 words:

# plot the top 30 words
abstractWords %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in Abstracts")

Deal with Stop Words

data("stop_words")
# how many words do you have including the stop words?
nrow(abstractWords)
[1] 18378
abstractClean <- abstractWords %>%
  anti_join(stop_words)

# how many words after removing the stop words?
nrow(abstractClean)
[1] 9986
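
If domain words that survive the standard stop list ever swamp the cloud, one option would be to append a custom stop list before the anti_join. A minimal sketch, where the words and the name are purely illustrative:

# hypothetical custom stop list
myStopWords <- tibble(word = c("study", "studies"), lexicon = "custom")

abstractCleanCustom <- abstractWords %>%
  anti_join(bind_rows(stop_words, myStopWords))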

Replot top 30

# plot the top 30 words -- notice any issues?
abstractClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in Abstracts (cleaned)")

Put the top 40 into a dataframe

# put the top 40 words into their own dataframe

top40 <- abstractClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(40) %>%
  mutate(word = reorder(word, n))

head(top40)
# A tibble: 6 × 2
  word            n
  <fct>       <int>
1 experiment    125
2 voters        111
3 field         110
4 political     109
5 information    81
6 social         71
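
One caveat: top_n() keeps ties, so the “top 40” can hold more than 40 rows, and newer dplyr supersedes it with slice_max(). A tie-free equivalent would be:

# with_ties = FALSE caps the result at exactly 40 rows
top40 <- abstractClean %>%
  dplyr::count(word, sort = TRUE) %>%
  slice_max(n, n = 40, with_ties = FALSE) %>%
  mutate(word = reorder(word, n))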

Finally, word cloud!!

library(wordcloud2)
set.seed(200)
wordcloud2(data=top40, color = "random-dark", size = .75)

For some reason, not all of the terms appear when I export it as a PDF. I don’t know why.
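
wordcloud2 renders an HTML widget rather than a base R plot, which is likely why static export drops some terms. A common workaround (assuming the htmlwidgets and webshot packages are installed, plus PhantomJS via webshot::install_phantomjs()) is to save the widget and screenshot it; the file names here are illustrative:

# save the widget as HTML, then render it to a static image
wc <- wordcloud2(data = top40, color = "random-dark", size = .75)
htmlwidgets::saveWidget(wc, "abstract_cloud.html", selfcontained = FALSE)
webshot::webshot("abstract_cloud.html", "abstract_cloud.png", delay = 5)  # delay lets the JavaScript finish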

Ethic* Word Cloud

Explore Common Words

# tokenize the ethic* paragraphs and drop any token containing a digit
ethicWords <- authors_abstracts %>%
  dplyr::select(Paragraph) %>%
  unnest_tokens(word, Paragraph) %>%
  filter(!grepl('[0-9]', word))

# keep only complete (non-NA) rows
ethicWords <- ethicWords[complete.cases(ethicWords), ]

#url_regex <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
#ethicWords <- str_remove_all(ethicWords, url_regex)
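
The commented-out attempt above would not quite do what’s wanted: str_remove_all() works on a character vector, and handing it the whole data frame collapses the column into one string. Since unnest_tokens() has already broken any URLs into pieces, a row-wise filter on url-like tokens is probably closer to the intent. A minimal sketch, with a hypothetical name:

# hypothetical fix: drop tokens that look like leftover url fragments
ethicWordsNoUrl <- ethicWords %>%
  filter(!grepl("^(http|https|www)", word))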

head(ethicWords)
# A tibble: 6 × 1
  word          
  <chr>         
1 ethical       
2 considerations
3 though        
4 the           
5 human         
6 subjects      

Plot the top 30 words:

# plot the top 30 words
ethicWords %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in Ethic* Paragraphs")

Deal with Stop Words

data("stop_words")
# how many words do you have including the stop words?
nrow(ethicWords)
[1] 8097
ethicClean <- ethicWords %>%
  anti_join(stop_words) %>%
  filter(!grepl("[[:digit:]]", word))

# how many words after removing the stop words?
nrow(ethicClean)
[1] 4068

Replot top 30

# plot the top 30 words -- notice any issues?
ethicClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in Ethic* Paragraphs (cleaned)")

Put the top 40 into a dataframe

# put the top 40 words into their own dataframe

top40ethic <- ethicClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(40) %>%
  mutate(word = reorder(word, n))

head(top40ethic)
# A tibble: 6 × 2
  word           n
  <fct>      <int>
1 ethical       63
2 research      57
3 study         36
4 experiment    32
5 political     32
6 ethics        30

Finally, word cloud!!

library(wordcloud2)
set.seed(200)
wordcloud2(data=top40ethic, color = "random-dark", size = .75)

Now I am going to drill down to just the substantive paragraphs and see what differences, if any, there are from the overall ethics words.

Ethic* SUBSTANTIVE Word Cloud

Explore Common Words

# read in the coded ethics paragraphs
setwd("~/DACCS R/Text as Data/Final Project TaD R")
library(readr)

authors_ethics <- read_csv("authors_ethics.csv", show_col_types = FALSE)
ethicWordsSub <- authors_ethics %>% dplyr::select(Paragraph, TypeMention)

# keep only paragraphs coded as substantive mentions, then tokenize
ethicWordsSub <- ethicWordsSub %>% dplyr::filter(TypeMention == "substantive") %>%
  dplyr::select(Paragraph) %>%
  unnest_tokens(word, Paragraph) %>%
  filter(!grepl('[0-9]', word))

#ethicWordsSub <- ethicWordsSub[complete.cases(ethicWordsSub), ]

head(ethicWordsSub)
# A tibble: 6 × 1
  word          
  <chr>         
1 ethical       
2 considerations
3 though        
4 the           
5 human         
6 subjects      

Plot the top 30 words:

# plot the top 30 words
ethicWordsSub %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in Ethic* Paragraphs (substantive mentions only)")

Deal with Stop Words

data("stop_words")
# how many words do you have including the stop words?
nrow(ethicWordsSub)
[1] 5798
ethicCleanSub <- ethicWordsSub %>%
  anti_join(stop_words) %>%
  filter(!grepl("[[:digit:]]", word))

# how many words after removing the stop words?
nrow(ethicCleanSub)
[1] 2764

Replot top 30

ethicCleanSub %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in Ethic* Substantive Paragraphs (cleaned)")

Put the top 40 into a dataframe

# put the top 40 words into their own dataframe

top40ethicSub <- ethicCleanSub %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(40) %>%
  mutate(word = reorder(word, n))

head(top40ethicSub)
# A tibble: 6 × 2
  word            n
  <fct>       <int>
1 ethical        45
2 research       33
3 experiment     28
4 field          24
5 study          24
6 experiments    23

Finally, word cloud!!

set.seed(200)
wordcloud2(data=top40ethicSub, color = "random-dark", size = .75)

Compare the two top-40 lists with a Venn diagram:

library(VennDiagram)

ethic40 <- top40ethic %>% dplyr::select(word)
ethic40sub <- top40ethicSub %>% dplyr::select(word)

ethicVector <- as.vector(unlist(ethic40))
substantVector <- as.vector(unlist(ethic40sub))

venn.diagram(
  x = list(ethicVector, substantVector),
  category.names = c("All Ethic* Paragraphs", "Substantive Mentions"),
  col = c("#440154ff", "#21908dff"),
  fill = c(alpha("#440154ff", 0.3), alpha("#21908dff", 0.3)),
  filename = '#ethicsVenn7.png',
  output = TRUE,
  cat.dist = c(-0.125, -0.125),
  cat.pos = c(-90, 90)
)
[1] 1
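
As a quick text check on the same comparison, base R’s set operations on the two vectors built above list the overlap and the differences directly:

# words appearing in both top-40 lists
intersect(ethicVector, substantVector)
# words only in the all-paragraphs top 40
setdiff(ethicVector, substantVector)
# words only in the substantive top 40
setdiff(substantVector, ethicVector)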