Pulling in my articles, finding "ethics" and the surrounding words, and making word clouds
#function to perform pdf to text conversion for many documents
library(magrittr)  # for the %>% pipe
convertpdf2txt <- function(dirpath){
  files <- list.files(dirpath, full.names = TRUE)
  sapply(files, function(file){
    pdftools::pdf_text(file) %>%
      stringr::str_replace_all(stringr::fixed("\n"), " ") %>%
      stringr::str_replace_all(stringr::fixed("\r"), " ") %>%
      stringr::str_replace_all(stringr::fixed("\t"), " ") %>%
      stringr::str_replace_all(stringr::fixed("\""), " ") %>%
      paste(collapse = " ") %>%              # collapse the pages into one string
      stringr::str_squish() %>%
      stringr::str_replace_all("- ", "")     # rejoin words hyphenated across line breaks
  })
}
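The conversion and corpus-building step itself isn't shown; a minimal sketch, assuming all the article PDFs sit under one folder (the path is illustrative, and note that list.files() is not recursive):
texts <- convertpdf2txt("~/DACSS/697D Text as Data/Final Project Materials/Articles pdfs")
textsCorpus <- quanteda::corpus(texts)  # docnames become the file paths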
#check for metadata
docvars(textsCorpus)
data frame with 0 columns and 121 rows
No metadata is attached yet, so I'll build some.
First, pull in each journal individually (I am suppressing this code as well as all warnings for the sake of brevity, but am happy to provide it upon request).
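For one journal, that suppressed step presumably looked something like this (the path is illustrative; summary() is what supplies the Text/Types/Tokens/Sentences columns seen below):
AJPStexts   <- convertpdf2txt("Articles pdfs/American Journal of Political Science")
AJPScorpus  <- quanteda::corpus(AJPStexts)
AJPSsummary <- summary(AJPScorpus, n = quanteda::ndoc(AJPScorpus))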
#Not sure if there's a better way to generate a unique ID number than coding this manually
# American Journal of Political Science
AJPSsummary$Journal <- "American Journal of Political Science"
AJPSsummary$ID <- 1:nrow(AJPSsummary) + 100
docvars(AJPScorpus) <- AJPSsummary
#American Political Science Review
APSRsummary$Journal <- "American Political Science Review"
APSRsummary$ID <- 1:nrow(APSRsummary) + 200
docvars(APSRcorpus) <- APSRsummary
#American Politics Research
APRsummary$Journal <- "American Politics Research"
APRsummary$ID <- 1:nrow(APRsummary) + 300
docvars(APRcorpus) <- APRsummary
#Journal of Experimental Political Science
JEPSsummary$Journal <- "Journal of Experimental Political Science"
JEPSsummary$ID <- 1:nrow(JEPSsummary) + 400
docvars(JEPScorpus) <- JEPSsummary
#Political Science Research and Methods
PSRMsummary$Journal <- "Political Science Research and Methods"
PSRMsummary$ID <- 1:nrow(PSRMsummary) + 500
docvars(PSRMcorpus) <- PSRMsummary
#Research and Politics
RAPsummary$Journal <- "Research and Politics"
RAPsummary$ID <- 1:nrow(RAPsummary) + 600
docvars(RAPcorpus) <- RAPsummary
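(One alternative to the manual numbering: collect the summaries into a named list and loop, giving each journal a block of IDs in the 100s, 200s, and so on. A sketch; the corpora would still need their docvars assigned afterwards:)
summaries <- list(
  "American Journal of Political Science"     = AJPSsummary,
  "American Political Science Review"         = APSRsummary,
  "American Politics Research"                = APRsummary,
  "Journal of Experimental Political Science" = JEPSsummary,
  "Political Science Research and Methods"    = PSRMsummary,
  "Research and Politics"                     = RAPsummary
)
for (i in seq_along(summaries)) {
  summaries[[i]]$Journal <- names(summaries)[i]
  summaries[[i]]$ID      <- 1:nrow(summaries[[i]]) + i * 100
}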
Summary of each journal’s metadata:
names(AJPSsummary)
[1] "Text" "Types" "Tokens" "Sentences" "Journal"
[6] "ID"
names(APRsummary)
[1] "Text" "Types" "Tokens" "Sentences" "Journal"
[6] "ID"
names(APSRsummary)
[1] "Text" "Types" "Tokens" "Sentences" "Journal"
[6] "ID"
names(JEPSsummary)
[1] "Text" "Types" "Tokens" "Sentences" "Journal"
[6] "ID"
names(PSRMsummary)
[1] "Text" "Types" "Tokens" "Sentences" "Journal"
[6] "ID"
names(RAPsummary)
[1] "Text" "Types" "Tokens" "Sentences" "Journal"
[6] "ID"
#let's see if this works - suppressing output for the sake of brevity
combinedCorpus <- c(AJPScorpus, APRcorpus, APSRcorpus, JEPScorpus, PSRMcorpus, RAPcorpus)
#docvars(combinedCorpus)
#the docvars carry over when corpora are combined, so I can pull the combined metadata directly
combinedCorpusSummary <- docvars(combinedCorpus)
So now I have a corpus with metadata including journal title.
How many documents are in my combinedCorpus?
ndoc(combinedCorpus)
[1] 121
Now I’m going to tokenize my combinedCorpus, removing punctuation and numbers along the way:
combinedTokens <- quanteda::tokens(combinedCorpus,
                                   remove_punct = TRUE,
                                   remove_numbers = TRUE)
#print(combinedTokens)
First, I’m going to do a simple term search on the word “ethics” (eventually I will want to use lemmatization, I believe, but this is just a start).
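The kwic call itself is suppressed; presumably something along these lines (kwic_ethics is the object counted below, and window = 5 is quanteda's default, matching the five-word contexts shown):
kwic_ethics <- quanteda::kwic(combinedTokens, pattern = "ethics", window = 5)
head(kwic_ethics)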
Keyword-in-context with 6 matches.
[/Users/lissie/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science/Black Politicians Are More Intrinsically Motivated to Advance Blacks’ Interests_ A Field Experiment Manipulating Political Incentives.pdf, 4963]
    of the experidiscussion of the | ethics | of experimenting on public offiment's
[/Users/lissie/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science/Black Politicians Are More Intrinsically Motivated to Advance Blacks’ Interests_ A Field Experiment Manipulating Political Incentives.pdf, 9594]
    Staff Polity McClendon Gwyneth H | Ethics | of Using Public Officials Haider-Markel
[/Users/lissie/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science/Building Cooperation among Groups in Conflict_ An Experiment on Intersectarian Cooperation in Lebanon.pdf, 4969]
    GROUPS IN CONFLICT Compensation and | Ethics | in Table The dependent variable
[/Users/lissie/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science/Campaign Finance Transparency Affects Legislators’ Election Outcomes and Behavior.pdf, 7527]
    observed used data on congressional | ethics | investigations from correlations between scandals
[/Users/lissie/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science/Channels for Influence or Maps of Behavior_ A Field Experiment on Social Networks and Cooperation.pdf, 4976]
    network behave more similarly Hypothesis | Ethics | Second we have three experimental
[/Users/lissie/DACSS/697D Text as Data/Final Project Materials/Articles pdfs/American Journal of Political Science/Distorted Communication, Unequal Representation_ Constituents Communicate Less to Representatives Not of Their Race.pdf, 2648]
    any experiment on human subjects | ethics | were naturally raises questions about
nrow(kwic_ethics)
[1] 38
So this gives us 38 matches of “ethics” across the corpus (only the first six are printed above).
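The docfreq() lookups below need a document-feature matrix; ethicsDfm isn't built above, but presumably something like this (the dfm_select() restriction to ethic* features is my guess):
ethicsDfm <- quanteda::dfm(combinedTokens) %>%
  quanteda::dfm_select(pattern = "ethic*")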
Now I’m interested in how many documents each of the ethic* variants appears in:
docfreq(ethicsDfm)["ethic"]
ethic
3
docfreq(ethicsDfm)["ethica"]
ethica
1
docfreq(ethicsDfm)["ethics"]
ethics
26
docfreq(ethicsDfm)["ethical"]
ethical
37
docfreq(ethicsDfm)["ethically"]
ethically
3
docfreq(ethicsDfm)["ethicality"]
ethicality
1
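(Incidentally, all six lookups can be done in one call:)
docfreq(ethicsDfm)[c("ethic", "ethica", "ethics", "ethical", "ethically", "ethicality")]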
For building a vector/df for word clouds:
I have six search terms (based on visual inspection of my annotated data in a different file): ethic, ethica, ethics, ethical, ethically, and ethicality.
I think what I want to do is build a list of all the terms found within 10 words of my search terms, sort by frequency, and build a word cloud from that.
Now I’ll repeat this for each search term (this seems like a place where a loop would be good; I sketch one possibility after these calls).
#search each of the six terms within a 10-word window
ethicsWords1 <- kwic(combinedTokens,
                     pattern = "ethic",
                     window = 10)
ethicsWords2 <- kwic(combinedTokens,
                     pattern = "ethica",
                     window = 10)
ethicsWords3 <- kwic(combinedTokens,
                     pattern = "ethics",
                     window = 10)
ethicsWords4 <- kwic(combinedTokens,
                     pattern = "ethical",
                     window = 10)
ethicsWords5 <- kwic(combinedTokens,
                     pattern = "ethicality",
                     window = 10)
ethicsWords6 <- kwic(combinedTokens,
                     pattern = "ethically",
                     window = 10)
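(Answering my own loop question from above: one way is lapply() over the terms, which returns a list of kwic objects; ethicsKwicList is a name I'm introducing for the sketch.)
terms <- c("ethic", "ethica", "ethics", "ethical", "ethically", "ethicality")
ethicsKwicList <- lapply(terms, function(term) {
  kwic(combinedTokens, pattern = term, window = 10)
})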
Can I join these all into one dataframe?
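The join itself isn't shown; since kwic objects are data frames underneath, rbind() should work, presumably along these lines:
ethicsContext <- rbind(ethicsWords1, ethicsWords2, ethicsWords3,
                       ethicsWords4, ethicsWords5, ethicsWords6)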
Fantastic! Now I have them all in one dataframe!
#create a single column for pre and post and bind together
class(ethicsContext)
[1] "kwic" "data.frame"
library(tidytext)  # for unnest_tokens() and the stop_words data
ethicsWords <- text_both %>%
  dplyr::select(text) %>%
  unnest_tokens(word, text)
head(ethicsWords)
word
1 desirability
2 not
3 to
4 admit
5 to
6 overalso
Plot the top 30 words:
library(ggplot2)
# plot the top 30 words
ethicsWords %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(y = "Count",
       title = "Count of unique words found in texts")
A majority of these seem to be stop words, so let’s fix that!
# how many words before removing the stop words?
nrow(ethicsWords)
[1] 1877
ethicsClean <- ethicsWords %>%
  anti_join(stop_words)
# how many words after removing the stop words?
nrow(ethicsClean)
[1] 1056
Replot the top 30:
# plot the top 30 words -- notice any issues?
ethicsClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(y = "Count",
       title = "Count of unique words found in texts")
Remove https and replot:
ethicsClean <- subset(ethicsClean, word != "https")
ethicsClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(y = "Count",
       title = "Count of unique words found in texts")
And now - wordcloud!
word_cloud <- ethicsClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30)
library(wordcloud2)
library(paletteer)
set.seed(50)
#wordcloud2(word_cloud, color = paletteer_c("grDevices::RdBu", 30))
wordcloud2(data = word_cloud, size = .75,
           color = paletteer_c("ggthemes::Classic Red-Blue", 30))
Now to export it as a static image!
setwd("~/DACCS R/Text as Data")
# install webshot
library(webshot)
webshot::install_phantomjs()
# Make the graph
set.seed(50)
#wordcloud2(word_cloud, color = paletteer_c("grDevices::RdBu", 30))
my_graph <- wordcloud2(data = word_cloud, size = .75,
color = "random-dark")
my_graph
# save it in html
library("htmlwidgets")
saveWidget(my_graph,"tmp.html",selfcontained = F)
# and in png or pdf
webshot("tmp.html","ethics5.pdf", delay =5, vwidth = 1000, vheight=800)