get libraries
library(pdftools)
## Warning: package 'pdftools' was built under R version 3.6.3
## Using poppler version 0.73.0
library(tm)
## Warning: package 'tm' was built under R version 3.6.3
## Loading required package: NLP
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(ggthemes)
import data from PDF and read its contents with OCR
PoorLawStats <- pdf_ocr_text(pdf = "C:/Users/Owner/Downloads/2956767.pdf", pages = c(2,3,4,9,10), language = "eng", dpi = 1200)
## Converting page 2 to 2956767_2.png... done!
## Converting page 3 to 2956767_3.png... done!
## Converting page 4 to 2956767_4.png... done!
## Converting page 9 to 2956767_9.png... done!
## Converting page 10 to 2956767_10.png... done!
#I dropped the title page from JSTOR, and a few pages that were just graphs that would confuse the OCR.
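Before settling on those pages, the PDF's page count can be confirmed with pdf_info(); a quick sketch using the same local file path (pdfDetails is just a throwaway name):
#optional sanity check: how many pages does the PDF have?
pdfDetails <- pdf_info("C:/Users/Owner/Downloads/2956767.pdf")
pdfDetails$pages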
clean up the data while it is still a character vector
PoorLawStats <- gsub(PoorLawStats, pattern = "Mr", replacement = "")
PoorLawStats <- gsub(PoorLawStats, pattern = "per ", replacement = "")
PoorLawStats <- gsub(PoorLawStats, pattern = "cent", replacement = "")
#dropping a frequently mentioned name
PoorLawStats <- gsub(PoorLawStats, pattern = "Yule", replacement = "")
#PoorLawStats <- gsub(PoorLawStats, pattern = "per", replacement = "")
check the data
#PoorLawStats
#Looks pretty good.
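One more quick check worth sketching out: the number of characters OCR pulled from each page, plus the start of the first page.
#rough check on the OCR output: characters per page, then a peek at page one
nchar(PoorLawStats)
substr(PoorLawStats[1], 1, 300)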
Now I need to put the article data into a Corpus.
#Check available sources.
getSources()
## [1] "DataframeSource" "DirSource" "URISource" "VectorSource"
## [5] "XMLSource" "ZipSource"
Utilize the VectorSource.
poorLawCorpus <- Corpus(VectorSource(PoorLawStats))
Clean the data of extraneous details
#remove punctuation
poorLawCorpus <- tm_map(poorLawCorpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(poorLawCorpus, removePunctuation): transformation
## drops documents
#make everything lower case
poorLawCorpus <- tm_map(poorLawCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(poorLawCorpus, content_transformer(tolower)):
## transformation drops documents
#remove numbers from my text analysis
poorLawCorpus <- tm_map(poorLawCorpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(poorLawCorpus, removeNumbers): transformation
## drops documents
#remove short junk words like 'and', 'but', 'a', 'the'...
poorLawCorpus <- tm_map(poorLawCorpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(poorLawCorpus, removeWords,
## stopwords("english")): transformation drops documents
#clean up the blank space. put the document together.
poorLawCorpus <- tm_map(poorLawCorpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(poorLawCorpus, stripWhitespace): transformation
## drops documents
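The "transformation drops documents" warnings look alarming, but the corpus still holds all five pages (the Document Term Matrix below confirms it). To verify the cleaning took effect, a quick peek at the first cleaned document works; a sketch:
#peek at the start of the first cleaned document
substr(as.character(poorLawCorpus[[1]]), 1, 300)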
Utilize the Corpus to make a Document Term Matrix.
poorLawDTM <- DocumentTermMatrix(poorLawCorpus)
inspect my DTM
inspect(poorLawDTM)
## <<DocumentTermMatrix (documents: 5, terms: 569)>>
## Non-/sparse entries: 803/2042
## Sparsity : 72%
## Maximal term length: 23
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs administration average figures general may outrelief pauperism proportion
## 1 1 0 2 1 2 0 0 0
## 2 3 2 2 2 0 3 8 1
## 3 1 9 0 1 2 10 7 4
## 4 2 0 3 2 2 7 6 4
## 5 2 0 2 3 2 4 6 0
## Terms
## Docs true unions
## 1 0 2
## 2 2 0
## 3 3 8
## 4 4 2
## 5 1 2
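With only five documents the matrix is small enough to leave alone, but on a larger corpus the rarer terms could be trimmed with removeSparseTerms(). A sketch, with an arbitrary 0.4 sparsity cutoff and a throwaway name:
#optional: keep only terms that appear in most of the pages
poorLawDTMDense <- removeSparseTerms(poorLawDTM, sparse = 0.4)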
Make a list of most used terms
poorLawFreq <- colSums(as.matrix(poorLawDTM))
#Check to make sure the length matches the number of terms
length(poorLawFreq)
## [1] 569
order the list
poorLawOrdered <- order(poorLawFreq,decreasing=TRUE)
Check the head of the ordered list
poorLawFreq[head(poorLawOrdered)]
## pauperism outrelief unions average true
## 27 24 14 11 10
## administration
## 9
check the tail of the ordered list
poorLawFreq[tail(poorLawOrdered)]
## test theory undirectly unsuccessfully wellbeing
## 1 1 1 1 1
## widespread
## 1
find frequently used terms
findFreqTerms(poorLawDTM,lowfreq=10)
## [1] "unions" "average" "outrelief" "pauperism" "true"
find word associations
findAssocs(x = poorLawDTM, terms = "pauperism", corlimit = 0.6)
## $pauperism
## find contention account due relation
## 0.79 0.71 0.70 0.70 0.70
## shown less true count outrelief
## 0.70 0.68 0.66 0.64 0.63
## administration bears coincident degree greater
## 0.61 0.61 0.61 0.61 0.61
## perage persistent still yet
## 0.61 0.61 0.61 0.61
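The same call works for any other term in the matrix; for example, the associations for "outrelief" could be pulled with the same cutoff (sketch only, output not shown):
#associations for another frequent term
findAssocs(x = poorLawDTM, terms = "outrelief", corlimit = 0.6)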
make a wordcloud
set.seed(1337)
wordcloud(names(poorLawFreq),poorLawFreq,min.freq=8,colors=brewer.pal(7,"Dark2"))
title(main = "Poor Law Statistics\nCharles Booth", sub = 'Booth, Charles. "Poor Law Statistics." The Economic Journal 6, no. 21 (1896): 70-74. \nAccessed June 10, 2020. doi:10.2307/2956767.')
make a plot of frequently used terms
#Turn my ordered word list into a data frame for ggplot2
poorLawData <- data.frame(term = names(poorLawFreq), occurrences = poorLawFreq)
#ggplot it
p <- ggplot(subset(poorLawData, occurrences > 7), aes(term, occurrences))
p <- p + geom_bar(stat = "identity")
p <- p + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p <- p + labs(title = "Poor Law Statistics\nCharles Booth", caption = 'Booth, Charles. "Poor Law Statistics." The Economic Journal 6, no. 21 (1896): 70-74. \nAccessed June 10, 2020. doi:10.2307/2956767.')
p
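If the chart needs to be kept as a file, ggsave() handles that; a sketch with a placeholder filename and size:
#optional: write the frequency plot to disk
ggsave("poorLawFrequencies.png", plot = p, width = 8, height = 5)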
#Begin next stage: Sentiment Analysis
load libraries
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(textdata)
## Warning: package 'textdata' was built under R version 3.6.3
get the sentiment lexicons
get_sentiments(lexicon = "afinn")
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
get_sentiments("bing")
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
Change Corpus type and pre-process it for tidy sentiment analysis
poorLawVectorCorpus <- VCorpus(VectorSource(poorLawCorpus))
poorLawTidy <- poorLawVectorCorpus %>% tidy()
poorLawTidy
## # A tibble: 5 x 8
## author datetimestamp description heading id language origin text
## <lgl> <dttm> <lgl> <lgl> <chr> <chr> <lgl> <chr>
## 1 NA 2020-06-16 21:06:54 NA NA 1 en NA "poor la~
## 2 NA 2020-06-16 21:06:54 NA NA 2 en NA "poor la~
## 3 NA 2020-06-16 21:06:54 NA NA 3 en NA " econom~
## 4 NA 2020-06-16 21:06:54 NA NA 4 en NA "poor la~
## 5 NA 2020-06-16 21:06:54 NA NA 5 en NA " econom~
strip away the unneeded metadata, keeping just the page id and the text
poorLawTidy2 <- poorLawTidy %>%
select(id, text)
separate the text into individual words, one word per row
poorLawTidy3 <- poorLawTidy2 %>%
unnest_tokens(word, text)
show the first 15 words by their pages.
poorLawTidy3 %>%
select(id, word) %>%
head(15)
## # A tibble: 15 x 2
## id word
## <chr> <chr>
## 1 1 poor
## 2 1 law
## 3 1 statistics
## 4 1 september
## 5 1 ehconomic
## 6 1 journal
## 7 1 published
## 8 1 criticism
## 9 1 loch
## 10 1 book
## 11 1 condition
## 12 1 aged
## 13 1 poor
## 14 1 hoped
## 15 1 within
clean up the data a bit more
poorLawMoreTidy <- poorLawTidy3 %>%
anti_join(get_stopwords()) %>%
filter(is.na(as.numeric(word)))
## Joining, by = "word"
## Warning in mask$eval_all_filter(dots, env_filter): NAs introduced by coercion
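The coercion warning comes from calling as.numeric() on ordinary words; an equivalent filter with stringr (already loaded) would avoid it. A sketch of that alternative, not what was run above:
#alternative: drop tokens made up of digits without triggering the coercion warning
poorLawMoreTidyAlt <- poorLawTidy3 %>%
anti_join(get_stopwords(), by = "word") %>%
filter(!str_detect(word, "^[0-9.]+$"))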
view cleaned up word list
poorLawMoreTidy %>%
select(id, word) %>%
head(15)
## # A tibble: 15 x 2
## id word
## <chr> <chr>
## 1 1 poor
## 2 1 law
## 3 1 statistics
## 4 1 september
## 5 1 ehconomic
## 6 1 journal
## 7 1 published
## 8 1 criticism
## 9 1 loch
## 10 1 book
## 11 1 condition
## 12 1 aged
## 13 1 poor
## 14 1 hoped
## 15 1 within
plot the top words to check that the tidy counts match the earlier tm frequencies
poorLawMoreTidy %>%
count(word, sort = TRUE) %>%
head(8) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
looks good
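To double check that the counts line up with the earlier tm numbers, the top tidy counts can be laid next to the ordered frequency vector; a quick sketch:
#compare the tidy counts with the tm frequencies from earlier
poorLawMoreTidy %>% count(word, sort = TRUE) %>% head(8)
poorLawFreq[head(poorLawOrdered, 8)]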
Sentiment analysis: match each word with its sentiment value from the AFINN lexicon, along with its page id.
poorLawWordCounts <- poorLawTidy3 %>% inner_join(get_sentiments("afinn")) %>% count(id, word, value)
## Joining, by = "word"
sum the sentiment scores for each page
poorLawWordCounts <- poorLawWordCounts %>%
group_by(id) %>%
summarise(sentiment = sum(value*n))
## `summarise()` ungrouping output (override with `.groups` argument)
poorLawWordCounts
## # A tibble: 5 x 2
## id sentiment
## <chr> <dbl>
## 1 1 -9
## 2 2 -1
## 3 3 7
## 4 4 3
## 5 5 13
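As a cross check, the same per-page tally can be run with the bing lexicon, which only tags words as positive or negative; a sketch, output not shown:
#cross-check: positive vs. negative word counts per page using bing
poorLawTidy3 %>%
inner_join(get_sentiments("bing"), by = "word") %>%
count(id, sentiment)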
sentiment by page
sentimentByPage <- poorLawTidy3 %>%
group_by(id) %>%
count(word)
sentimentByPage <- inner_join(sentimentByPage, get_sentiments("afinn"))
## Joining, by = "word"
sentimentByPage <- sentimentByPage %>%
summarise(pageSentiment = n * value)
## `summarise()` regrouping output by 'id' (override with `.groups` argument)
sentimentByPage <- sentimentByPage %>% summarise(pageSentiment = sum(pageSentiment))
## `summarise()` ungrouping output (override with `.groups` argument)
plot(sentimentByPage$id, sentimentByPage$pageSentiment)
barplot(height = sentimentByPage$pageSentiment, names.arg = sentimentByPage$id, main = "Sentiment by Page")
plot sentiment
ggplot(poorLawWordCounts, aes(id, sentiment, fill = NULL)) +
geom_col(show.legend = F) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x = "Selected Page", y = "Aggregate Sentiment", title = "Sentiment in Charles Booth's \n'Poor Law Statistics'", caption = "Finn Årup Nielsen A new ANEW: Evaluation of a word list for sentiment analysis in microblogs. \nProceedings of the ESWC2011 Workshop on 'Making Sense of Microposts': Big things come in small packages \n718 in CEUR Workshop Proceedings 93-98. 2011 May. http://arxiv.org/abs/1103.2903.", subtitle = "Using AFINN lexicon") +
theme_foundation()