Text as Data Wordclouds for Final Poster

Pulling in my articles, finding "ethics" and the surrounding words, and making word clouds

Lissie Bates-Haus, Ph.D. (https://github.com/lbateshaus), UMass Amherst DACSS MS student (https://www.umass.edu/sbs/data-analytics-and-computational-social-science-program/ms)
2022-05-01

Load in libraries:

#load in initial necessary libraries

library(quanteda)
library(readr)
library(dplyr)
library(stringr)
library(tidytext)

Load in all articles as a single corpus:

#function to perform pdf to text conversion for many documents

convertpdf2txt <- function(dirpath){
  files <- list.files(dirpath, full.names = TRUE)
  sapply(files, function(file){
    pdftools::pdf_text(file) %>%                      #extract text, one string per page
      stringr::str_replace_all(fixed("\n"), " ") %>%  #replace line breaks,
      stringr::str_replace_all(fixed("\r"), " ") %>%  #carriage returns,
      stringr::str_replace_all(fixed("\t"), " ") %>%  #tabs,
      stringr::str_replace_all(fixed("\""), " ") %>%  #and quotation marks with spaces
      paste(sep = " ", collapse = " ") %>%            #collapse the pages into one string per pdf
      stringr::str_squish() %>%                       #squeeze repeated whitespace
      stringr::str_replace_all("- ", "")              #re-join words hyphenated across line breaks
  })
}

Apply the function to the directory and create corpus:

texts <- convertpdf2txt("~/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/All Articles")
#Create Corpus
textsCorpus <- corpus(texts)
#head(textsCorpus)
textsSummary <- summary(textsCorpus)
#head(textsSummary)

Check for metadata:

#check for metadata
docvars(textsCorpus)
data frame with 0 columns and 121 rows

No metadata available.

First, pull in each journal individually (I am suppressing this code as well as all warnings for the sake of brevity, but am happy to provide it upon request).
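For reference, each journal's block looks roughly like this (a sketch of the suppressed code; the directory path is an assumption based on the file paths that show up in the kwic output further down):

#example for one journal (AJPS); the other five follow the same pattern
AJPStexts <- convertpdf2txt("~/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science")
AJPScorpus <- corpus(AJPStexts)
AJPSsummary <- summary(AJPScorpus)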

Add Journal identifier and unique ID to each summary and apply it to the Corpus:

#Not sure if there's a better way to generate a unique ID number than coding this manually (a possible alternative is sketched after these blocks)

# American Journal of Political Science
AJPSsummary$Journal <- "American Journal of Political Science"
AJPSsummary$ID <- 1:nrow(AJPSsummary) + 100
docvars(AJPScorpus) <- AJPSsummary

#American Political Science Review
APSRsummary$Journal <- "American Political Science Review"
APSRsummary$ID <- 1:nrow(APSRsummary) + 200
docvars(APSRcorpus) <- APSRsummary

#American Politics Research
APRsummary$Journal <- "American Politics Research"
APRsummary$ID <- 1:nrow(APRsummary) + 300
docvars(APRcorpus) <- APRsummary

#Journal of Experimental Political Science
JEPSsummary$Journal <- "Journal of Experimental Political Science"
JEPSsummary$ID <- 1:nrow(JEPSsummary) + 400
docvars(JEPScorpus) <- JEPSsummary

#Political Science Research and Methods
PSRMsummary$Journal <- "Political Science Research and Methods"
PSRMsummary$ID <- 1:nrow(PSRMsummary) + 500
docvars(PSRMcorpus) <- PSRMsummary

#Research and Politics
RAPsummary$Journal <- "Research and Politics"
RAPsummary$ID <- 1:nrow(RAPsummary) + 600
docvars(RAPcorpus) <- RAPsummary
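Since I wasn't sure about a better way to do this than coding it manually, here is a possible alternative (just a sketch, assuming the six per-journal corpora and summaries above already exist; not what I actually ran):

#loop over the corpora and summaries in parallel instead of copying the block six times
summaries <- list(AJPSsummary, APSRsummary, APRsummary,
                  JEPSsummary, PSRMsummary, RAPsummary)
corpora   <- list(AJPScorpus, APSRcorpus, APRcorpus,
                  JEPScorpus, PSRMcorpus, RAPcorpus)
journals  <- c("American Journal of Political Science",
               "American Political Science Review",
               "American Politics Research",
               "Journal of Experimental Political Science",
               "Political Science Research and Methods",
               "Research and Politics")
offsets   <- c(100, 200, 300, 400, 500, 600)

corpora <- Map(function(corp, summ, journal, offset) {
  summ$Journal <- journal
  summ$ID      <- seq_len(nrow(summ)) + offset   #same unique IDs as the manual version
  docvars(corp) <- summ
  corp
}, corpora, summaries, journals, offsets)

#the combined corpus could then be built with do.call(c, corpora)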

Summary of each journal’s metadata:

names(AJPSsummary)
[1] "Text"      "Types"     "Tokens"    "Sentences" "Journal"  
[6] "ID"       
names(APRsummary)
[1] "Text"      "Types"     "Tokens"    "Sentences" "Journal"  
[6] "ID"       
names(APSRsummary)
[1] "Text"      "Types"     "Tokens"    "Sentences" "Journal"  
[6] "ID"       
names(JEPSsummary)
[1] "Text"      "Types"     "Tokens"    "Sentences" "Journal"  
[6] "ID"       
names(PSRMsummary)
[1] "Text"      "Types"     "Tokens"    "Sentences" "Journal"  
[6] "ID"       
names(RAPsummary)
[1] "Text"      "Types"     "Tokens"    "Sentences" "Journal"  
[6] "ID"       

Join individual corpora into one combined corpus:

#let's see if this works - suppressing output for the sake of brevity

combinedCorpus <- c(AJPScorpus, APRcorpus, APSRcorpus, JEPScorpus, PSRMcorpus, RAPcorpus)
#docvars(combinedCorpus)

#so I guess I don't need to create a summary? what happens if I do?
combinedCorpusSummary <- docvars(combinedCorpus)
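As a quick sanity check (not run above), the journal labels can be tabulated straight from the combined docvars:

#how many documents carry each journal label?
table(docvars(combinedCorpus, "Journal"))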

So now I have a corpus with metadata, including journal title, attached as docvars. (And to answer my earlier question: docvars() on the combined corpus simply returns the stacked metadata from each sub-corpus, so a separate summary step isn't strictly necessary.)

How many documents in my combinedCorpus?

ndoc(combinedCorpus)
[1] 121

Now I’m going to tokenize my combinedCorpus, as well as remove numbers and punctuation:

combinedTokens <- quanteda::tokens(combinedCorpus, 
    remove_punct = T,
    remove_numbers = T)
#print(combinedTokens)

First, I’m going to do a simple term search on the word “ethics” (eventually I may want to use lemmatization, but this is just a start):

kwic_ethics <- quanteda::kwic(combinedTokens, 
                              pattern = c("ethics"))
head(kwic_ethics)
Keyword-in-context with 6 matches.                                                                                                                                                                                                                                                                    
 [/Users/lissie/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science/Black Politicians Are More Intrinsically Motivated to Advance Blacks’ Interests_ A Field Experiment Manipulating Political Incentives.pdf, 4963]
 [/Users/lissie/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science/Black Politicians Are More Intrinsically Motivated to Advance Blacks’ Interests_ A Field Experiment Manipulating Political Incentives.pdf, 9594]
                                 [/Users/lissie/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science/Building Cooperation among Groups in Conflict_ An Experiment on Intersectarian Cooperation in Lebanon.pdf, 4969]
                                                     [/Users/lissie/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science/Campaign Finance Transparency Affects Legislators’ Election Outcomes and Behavior.pdf, 7527]
                                     [/Users/lissie/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science/Channels for Influence or Maps of Behavior_ A Field Experiment on Social Networks and Cooperation.pdf, 4976]
                   [/Users/lissie/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science/Distorted Communication, Unequal Representation_ Constituents Communicate Less to Representatives Not of Their Race.pdf, 2648]
                                                    
           of the experidiscussion of the | ethics |
         Staff Polity McClendon Gwyneth H | Ethics |
      GROUPS IN CONFLICT Compensation and | Ethics |
      observed used data on congressional | ethics |
 network behave more similarly Hypothesis | Ethics |
         any experiment on human subjects | ethics |
                                                  
 of experimenting on public offiment's            
 of Using Public Officials Haider-Markel          
 in Table The dependent variable                  
 investigations from correlations between scandals
 Second we have three experimental                
 were naturally raises questions about            
nrow(kwic_ethics)
[1] 38

So this gives us 38 matches of the exact word “ethics”.
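(A side note: rather than lemmatizing, quanteda’s default glob matching might be enough here; a single wildcard pattern picks up every ethic- variant at once. This is just a sketch, not what I ran above.)

#"ethic*" uses glob matching, so one call catches ethic, ethics, ethical, ethically, etc.
kwic_ethic_all <- kwic(combinedTokens, pattern = "ethic*")
nrow(kwic_ethic_all)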

Now I’m interested in how many documents it appears in:

#so first I need to make a dfm?

ethicsDfm <- dfm(combinedTokens)
docfreq(ethicsDfm)["ethic"]
ethic 
    3 
docfreq(ethicsDfm)["ethica"]
ethica 
     1 
docfreq(ethicsDfm)["ethics"]
ethics 
    26 
docfreq(ethicsDfm)["ethical"]
ethical 
     37 
docfreq(ethicsDfm)["ethically"]
ethically 
        3 
docfreq(ethicsDfm)["ethicality"]
ethicality 
         1 
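The same lookups could be done in one step by selecting every ethic- feature first (again, just a sketch):

#document frequencies for every feature starting with "ethic"
docfreq(dfm_select(ethicsDfm, pattern = "ethic*"))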

For building a vector/df for wordclouds:

I have six search terms, based on visual inspection of my annotated data in a different file: ethic, ethica, ethics, ethical, ethically, and ethicality.

I think what I want to do is build a list of all the terms found within 10 words of my search terms, sort by frequency, and build a wordcloud from that.

# now look at a broader window of terms around "ethic"
ethicsWords1 <- kwic(combinedTokens, 
      pattern = c("ethic"),
      window = 10)

Now I’ll repeat this for each search term (this is where a loop would probably be better; a sketch of that approach follows the combined dataframe below).

ethicsWords2 <- kwic(combinedTokens, 
      pattern = c("ethica"),
      window = 10)

ethicsWords3 <- kwic(combinedTokens, 
      pattern = c("ethica"),
      window = 10)

ethicsWords4 <- kwic(combinedTokens, 
      pattern = c("ethical"),
      window = 10)

ethicsWords5 <- kwic(combinedTokens, 
      pattern = c("ethicality"),
      window = 10)

ethicsWords6 <- kwic(combinedTokens, 
      pattern = c("ethically"),
      window = 10)

Can I join these all into one dataframe?

ethicsContext <- rbind(ethicsWords1, ethicsWords2)
ethicsContext <- rbind(ethicsContext, ethicsWords3)
ethicsContext <- rbind(ethicsContext, ethicsWords4)
ethicsContext <- rbind(ethicsContext, ethicsWords5)
ethicsContext <- rbind(ethicsContext, ethicsWords6)

Fantastic! Now I have them all in one dataframe!
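For the record, here is the loop-style sketch promised above: the six kwic() calls and the rbind chain could be collapsed into one step, either by passing all six patterns to kwic() at once or by looping with lapply(). (A sketch, not what I ran.)

terms <- c("ethic", "ethica", "ethics", "ethical", "ethicality", "ethically")

#option 1: kwic() accepts a vector of patterns
ethicsContext <- kwic(combinedTokens, pattern = terms, window = 10)

#option 2: loop over the terms and bind, equivalent to the rbind chain above
ethicsContext <- do.call(rbind, lapply(terms, function(term) {
  kwic(combinedTokens, pattern = term, window = 10)
}))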

Explore Common Words

#create a single column for pre and post and bind together
class(ethicsContext)
[1] "kwic"       "data.frame"
ethicsContext <- as.data.frame(ethicsContext)
text <- ethicsContext %>% select(pre)
text <- rename(text, text = pre)
text2 <- ethicsContext %>% select(post)
text2 <- rename(text2, text = post)
text_both <- rbind(text, text2)
ethicsWords <- text_both %>%
  dplyr::select(text) %>%
  unnest_tokens(word, text)

head(ethicsWords)
          word
1 desirability
2          not
3           to
4        admit
5           to
6     overalso
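(The pre/post stacking could also be written with tidyr, which isn’t loaded above, so this is just an alternative sketch; it yields the same words, though possibly in a different row order.)

library(tidyr)

#stack the pre and post context columns into one text column, then tokenize
ethicsWordsAlt <- ethicsContext %>%
  select(pre, post) %>%
  pivot_longer(cols = c(pre, post), values_to = "text") %>%
  select(text) %>%
  unnest_tokens(word, text)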

Plot the top 30 words:

library(ggplot2)

# plot the top 30 words
ethicsWords %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL,
       y = "Count",
       title = "Count of unique words found in texts")

Deal with Stop Words

A majority of these seem to be stop words, so let’s fix that!

data("stop_words")
# how many words do you have including the stop words?
nrow(ethicsWords)
[1] 1877
ethicsClean <- ethicsWords %>%
  anti_join(stop_words)

# how many words after removing the stop words?
nrow(ethicsClean)
[1] 1056

Replot top 30

# plot the top 30 words -- notice any issues?
ethicsClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL,
       y = "Count",
       title = "Count of unique words found in texts")

Remove https and replot:

ethicsClean <- subset(ethicsClean, word!="https")
ethicsClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL,
       y = "Count",
       title = "Count of unique words found in texts")

And now - wordcloud!

word_cloud <- ethicsClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30)

library(wordcloud2)
library(paletteer)
set.seed(50)
#wordcloud2(word_cloud, color = paletteer_c("grDevices::RdBu", 30))
wordcloud2(data = word_cloud, size = .75, 
           color = paletteer_c("ggthemes::Classic Red-Blue", 30))

Now to export it as a png!

setwd("~/DACCS R/Text as Data")

# install webshot
library(webshot)
webshot::install_phantomjs()

# Make the graph
set.seed(50)
#wordcloud2(word_cloud, color = paletteer_c("grDevices::RdBu", 30))
my_graph <- wordcloud2(data = word_cloud, size = .75, 
           color = "random-dark")

my_graph
# save it in html
library("htmlwidgets")
saveWidget(my_graph,"tmp.html",selfcontained = F)

# and in png or pdf
webshot("tmp.html","ethics5.pdf", delay =5, vwidth = 1000, vheight=800)