get libraries
library(pdftools)
## Warning: package 'pdftools' was built under R version 3.6.3
## Using poppler version 0.73.0
library(tm)
## Warning: package 'tm' was built under R version 3.6.3
## Loading required package: NLP
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(ggthemes)
import data from PDF and read its contents with OCR
PoorLawStats <- pdf_ocr_text(pdf = "C:/Users/Owner/Downloads/2956767.pdf", pages = c(2,3,4,9,10), language = "eng", dpi = 1200)
## Converting page 2 to 2956767_2.png... done!
## Converting page 3 to 2956767_3.png... done!
## Converting page 4 to 2956767_4.png... done!
## Converting page 9 to 2956767_9.png... done!
## Converting page 10 to 2956767_10.png... done!
#I dropped the title page from JSTOR, and a few pages that were just graphs that would confuse the OCR.
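Before settling on those pages, the PDF's page count can be confirmed with pdf_info(); a quick sketch using the same local file path (pdfDetails is just a throwaway name):
#optional sanity check: how many pages does the PDF have?
pdfDetails <- pdf_info("C:/Users/Owner/Downloads/2956767.pdf")
pdfDetails$pages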
clean up the data while it is still a character vector
PoorLawStats <- gsub(PoorLawStats, pattern = "Mr", replacement = "")
PoorLawStats <- gsub(PoorLawStats, pattern = "per ", replacement = "")
PoorLawStats <- gsub(PoorLawStats, pattern = "cent", replacement = "")
#dropping a frequently mentioned name
PoorLawStats <- gsub(PoorLawStats, pattern = "Yule", replacement = "")
#PoorLawStats <- gsub(PoorLawStats, pattern = "per", replacement = "")
check the data
#PoorLawStats
#Looks pretty good.
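One more quick check worth sketching out: the number of characters OCR pulled from each page, plus the start of the first page.
#rough check on the OCR output: characters per page, then a peek at page one
nchar(PoorLawStats)
substr(PoorLawStats[1], 1, 300)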
Now I need to put the article data into a Corpus.
#Check available sources.
getSources()
## [1] "DataframeSource" "DirSource" "URISource" "VectorSource"
## [5] "XMLSource" "ZipSource"
Utilize the VectorSource.
poorLawCorpus <- Corpus(VectorSource(PoorLawStats))
Clean the data of extraneous details
#remove punctuation
poorLawCorpus <- tm_map(poorLawCorpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(poorLawCorpus, removePunctuation): transformation
## drops documents
#make everything lower case
poorLawCorpus <- tm_map(poorLawCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(poorLawCorpus, content_transformer(tolower)):
## transformation drops documents
#remove numbers from my text analysis
poorLawCorpus <- tm_map(poorLawCorpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(poorLawCorpus, removeNumbers): transformation
## drops documents
#remove short junk words like 'and', 'but', 'a', 'the'...
poorLawCorpus <- tm_map(poorLawCorpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(poorLawCorpus, removeWords,
## stopwords("english")): transformation drops documents
#clean up the blank space. put the document together.
poorLawCorpus <- tm_map(poorLawCorpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(poorLawCorpus, stripWhitespace): transformation
## drops documents
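The "transformation drops documents" warnings look alarming, but the corpus still holds all five pages (the Document Term Matrix below confirms it). To verify the cleaning took effect, a quick peek at the first cleaned document works; a sketch:
#peek at the start of the first cleaned document
substr(as.character(poorLawCorpus[[1]]), 1, 300)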
Utilize the Corpus to make a Document Term Matrix.
poorLawDTM <- DocumentTermMatrix(poorLawCorpus)
inspect my DTM
inspect(poorLawDTM)
## <<DocumentTermMatrix (documents: 5, terms: 569)>>
## Non-/sparse entries: 803/2042
## Sparsity : 72%
## Maximal term length: 23
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs administration average figures general may outrelief pauperism proportion
## 1 1 0 2 1 2 0 0 0
## 2 3 2 2 2 0 3 8 1
## 3 1 9 0 1 2 10 7 4
## 4 2 0 3 2 2 7 6 4
## 5 2 0 2 3 2 4 6 0
## Terms
## Docs true unions
## 1 0 2
## 2 2 0
## 3 3 8
## 4 4 2
## 5 1 2
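With only five documents the matrix is small enough to leave alone, but on a larger corpus the rarer terms could be trimmed with removeSparseTerms(). A sketch, with an arbitrary 0.4 sparsity cutoff and a throwaway name:
#optional: keep only terms that appear in most of the pages
poorLawDTMDense <- removeSparseTerms(poorLawDTM, sparse = 0.4)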
Make a list of most used terms
poorLawFreq <- colSums(as.matrix(poorLawDTM))
#Check to make sure the length matches the number of terms
length(poorLawFreq)
## [1] 569
order the list
poorLawOrdered <- order(poorLawFreq,decreasing=TRUE)
Check the head of the ordered list
poorLawFreq[head(poorLawOrdered)]
## pauperism outrelief unions average true
## 27 24 14 11 10
## administration
## 9
check the tail of the ordered list
poorLawFreq[tail(poorLawOrdered)]
## test theory undirectly unsuccessfully wellbeing
## 1 1 1 1 1
## widespread
## 1
find frequently used terms
findFreqTerms(poorLawDTM,lowfreq=10)
## [1] "unions" "average" "outrelief" "pauperism" "true"
find word associations
findAssocs(x = poorLawDTM, terms = "pauperism", corlimit = 0.6)
## $pauperism
## find contention account due relation
## 0.79 0.71 0.70 0.70 0.70
## shown less true count outrelief
## 0.70 0.68 0.66 0.64 0.63
## administration bears coincident degree greater
## 0.61 0.61 0.61 0.61 0.61
## perage persistent still yet
## 0.61 0.61 0.61 0.61
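The same call works for any other term in the matrix; for example, the associations for "outrelief" could be pulled with the same cutoff (sketch only, output not shown):
#associations for another frequent term
findAssocs(x = poorLawDTM, terms = "outrelief", corlimit = 0.6)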
make a wordcloud
set.seed(1337)
wordcloud(names(poorLawFreq),poorLawFreq,min.freq=8,colors=brewer.pal(7,"Dark2"))
title(main = "Poor Law Statistics\nCharles Booth", sub = 'Booth, Charles. "Poor Law Statistics." The Economic Journal 6, no. 21 (1896): 70-74. \nAccessed June 10, 2020. doi:10.2307/2956767.')
make a plot of frequently used terms
#Turn my ordered word list into a data frame for ggplot2
poorLawData <- data.frame(term = names(poorLawFreq), occurrences = poorLawFreq)
#ggplot it
p <- ggplot(subset(poorLawData, occurrences > 7), aes(term, occurrences))
p <- p + geom_bar(stat = "identity")
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p <- p + labs(title = "Poor Law Statistics\nCharles Booth", caption = 'Booth, Charles. "Poor Law Statistics." The Economic Journal 6, no. 21 (1896): 70-74. \nAccessed June 10, 2020. doi:10.2307/2956767.')
p
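If the chart needs to be kept as a file, ggsave() handles that; a sketch with a placeholder filename and size:
#optional: write the frequency plot to disk
ggsave("poorLawFrequencies.png", plot = p, width = 8, height = 5)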
#Begin next stage: Sentiment Analysis
load libraries
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(textdata)
## Warning: package 'textdata' was built under R version 3.6.3
get the sentiment lexicons
get_sentiments(lexicon = "afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
Change Corpus type and pre-process it for tidy sentiment analysis
poorLawVectorCorpus <- VCorpus(VectorSource(poorLawCorpus))
poorLawTidy <- poorLawVectorCorpus %>% tidy()
poorLawTidy
## # A tibble: 5 x 8
## author datetimestamp description heading id language origin text
## <lgl> <dttm> <lgl> <lgl> <chr> <chr> <lgl> <chr>
## 1 NA 2020-06-16 21:06:54 NA NA 1 en NA "poor la~
## 2 NA 2020-06-16 21:06:54 NA NA 2 en NA "poor la~
## 3 NA 2020-06-16 21:06:54 NA NA 3 en NA " econom~
## 4 NA 2020-06-16 21:06:54 NA NA 4 en NA "poor la~
## 5 NA 2020-06-16 21:06:54 NA NA 5 en NA " econom~
strip away the unneeded metadata, keeping just the page id and the text
poorLawTidy2 <- poorLawTidy %>%
select(id, text)
separate the text into individual words, one word per row
poorLawTidy3 <- poorLawTidy2 %>%
unnest_tokens(word, text)
show the first 15 words by their pages.
poorLawTidy3 %>%
select(id, word) %>%
head(15)
## # A tibble: 15 x 2
## id word
## <chr> <chr>
## 1 1 poor
## 2 1 law
## 3 1 statistics
## 4 1 september
## 5 1 ehconomic
## 6 1 journal
## 7 1 published
## 8 1 criticism
## 9 1 loch
## 10 1 book
## 11 1 condition
## 12 1 aged
## 13 1 poor
## 14 1 hoped
## 15 1 within
clean up the data a bit more
poorLawMoreTidy <- poorLawTidy3 %>%
anti_join(get_stopwords()) %>%
filter(is.na(as.numeric(word)))
## Joining, by = "word"
## Warning in mask$eval_all_filter(dots, env_filter): NAs introduced by coercion
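The coercion warning comes from calling as.numeric() on ordinary words; an equivalent filter with stringr (already loaded) would avoid it. A sketch of that alternative, not what was run above:
#alternative: drop tokens made up of digits without triggering the coercion warning
poorLawMoreTidyAlt <- poorLawTidy3 %>%
anti_join(get_stopwords(), by = "word") %>%
filter(!str_detect(word, "^[0-9.]+$"))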
view cleaned up word list
poorLawMoreTidy %>%
select(id, word) %>%
head(15)
## # A tibble: 15 x 2
## id word
## <chr> <chr>
## 1 1 poor
## 2 1 law
## 3 1 statistics
## 4 1 september
## 5 1 ehconomic
## 6 1 journal
## 7 1 published
## 8 1 criticism
## 9 1 loch
## 10 1 book
## 11 1 condition
## 12 1 aged
## 13 1 poor
## 14 1 hoped
## 15 1 within
plot the top words to check that the tidy counts match the earlier tm frequencies
poorLawMoreTidy %>%
count(word, sort = TRUE) %>%
head(8) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
looks good
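To double check that the counts line up with the earlier tm numbers, the top tidy counts can be laid next to the ordered frequency vector; a quick sketch:
#compare the tidy counts with the tm frequencies from earlier
poorLawMoreTidy %>% count(word, sort = TRUE) %>% head(8)
poorLawFreq[head(poorLawOrdered, 8)]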
Sentiment analysis: match each word with its sentiment value from the AFINN lexicon, along with its page id.
poorLawWordCounts <- poorLawTidy3 %>% inner_join(get_sentiments("afinn")) %>% count(id, word, value)
## Joining, by = "word"
sum the sentiment scores for each page
poorLawWordCounts <- poorLawWordCounts %>%
group_by(id) %>%
summarise(sentiment = sum(value*n))
## `summarise()` ungrouping output (override with `.groups` argument)
poorLawWordCounts
## # A tibble: 5 x 2
## id sentiment
## <chr> <dbl>
## 1 1 -9
## 2 2 -1
## 3 3 7
## 4 4 3
## 5 5 13
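As a cross check, the same per-page tally can be run with the bing lexicon, which only tags words as positive or negative; a sketch, output not shown:
#cross-check: positive vs. negative word counts per page using bing
poorLawTidy3 %>%
inner_join(get_sentiments("bing"), by = "word") %>%
count(id, sentiment)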
sentiment by page
sentimentByPage <- poorLawTidy3 %>%
group_by(id) %>%
count(word)
sentimentByPage <- inner_join(sentimentByPage, get_sentiments("afinn"))
## Joining, by = "word"
sentimentByPage <- sentimentByPage %>%
summarise(pageSentiment = n * value)
## `summarise()` regrouping output by 'id' (override with `.groups` argument)
sentimentByPage <- sentimentByPage %>% summarise(pageSentiment = sum(pageSentiment))
## `summarise()` ungrouping output (override with `.groups` argument)
plot(sentimentByPage$id, sentimentByPage$pageSentiment)
barplot(height = sentimentByPage$pageSentiment, names.arg = sentimentByPage$id, main = "Sentiment by Page")
plot sentiment
ggplot(poorLawWordCounts, aes(id, sentiment, fill = NULL)) +
geom_col(show.legend = F) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x = "Selected Page", y = "Aggregate Sentiment", title = "Sentiment in Charles Booth's \n'Poor Law Statistics'", caption = "Finn Årup Nielsen A new ANEW: Evaluation of a word list for sentiment analysis in microblogs. \nProceedings of the ESWC2011 Workshop on 'Making Sense of Microposts': Big things come in small packages \n718 in CEUR Workshop Proceedings 93-98. 2011 May. http://arxiv.org/abs/1103.2903.", subtitle = "Using AFINN lexicon") +
theme_foundation()