For this blog I think I've finally got a handle on my sentiment analysis, and I'm down to just choosing which lexicons I want to use.

library(textdata)
## Warning: package 'textdata' was built under R version 4.1.3
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidytext) # for text mining
library(gutenbergr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(wordcloud)
## Loading required package: RColorBrewer
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)
library(dplyr)

# books_corpus <- gutenberg_corpus(c(15776, 33310, 3300), filter = NULL, mirror = NULL, verbose = TRUE)
# books_corpus

# download the book "The Economic Consequences of the Peace" from Project Gutenberg
consequence_of_peace <- gutenberg_download(15776)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
names(consequence_of_peace) <- c("book", "text") # rename the column names so the gutenberg_id column is called book
consequence_of_peace$book <- "The Economic Consequences of the Peace" # replace the gutenberg_id with the book name so it is intuitive
# download the book "Principles of the Politcal Economy and Taxation"
politcal_economy_taxation <- gutenberg_download(33310) 
names(politcal_economy_taxation) <- c("book", "text") # rename the column names so the gutenberg_id column is called book
politcal_economy_taxation$book <- "On the Principles of the Political Economy and Taxation" #replace the gutenberg_id with the book name so it is intuitive

# download the book "An Inquiry into the Nature and Causes of the Wealth of Nations" from project gutenberg
wealth_of_nations <- gutenberg_download(3300) 
names(wealth_of_nations) <- c("book", "text") # rename the column names so the gutenberg_id column is called book
wealth_of_nations$book <- "An Inquiry into the Nature and Causes of the Wealth of Nations" # replace the gutenberg_id with the book name so it is intuitive

# combine all three books into one dataset
my_corpus <- rbind(wealth_of_nations, political_economy_taxation, consequence_of_peace)
my_corpus
## # A tibble: 53,583 x 2
##    book                                                           text          
##    <chr>                                                          <chr>         
##  1 An Inquiry into the Nature and Causes of the Wealth of Nations "An Inquiry i~
##  2 An Inquiry into the Nature and Causes of the Wealth of Nations ""            
##  3 An Inquiry into the Nature and Causes of the Wealth of Nations ""            
##  4 An Inquiry into the Nature and Causes of the Wealth of Nations ""            
##  5 An Inquiry into the Nature and Causes of the Wealth of Nations "by Adam Smit~
##  6 An Inquiry into the Nature and Causes of the Wealth of Nations ""            
##  7 An Inquiry into the Nature and Causes of the Wealth of Nations ""            
##  8 An Inquiry into the Nature and Causes of the Wealth of Nations ""            
##  9 An Inquiry into the Nature and Causes of the Wealth of Nations ""            
## 10 An Inquiry into the Nature and Causes of the Wealth of Nations ""            
## # ... with 53,573 more rows
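
As a side note, the repetitive download/rename steps above could be collapsed, since gutenberg_download() accepts a vector of IDs. A minimal sketch (the titles lookup vector is my own naming, not part of the original code):

# map each gutenberg_id to a readable title after a single download call
titles <- c(`3300`  = "An Inquiry into the Nature and Causes of the Wealth of Nations",
            `33310` = "On the Principles of the Political Economy and Taxation",
            `15776` = "The Economic Consequences of the Peace")
my_corpus_alt <- gutenberg_download(c(3300, 33310, 15776)) %>%
  transmute(book = unname(titles[as.character(gutenberg_id)]), text)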
# load stop words and sentiment lexicons 
data("stop_words")
get_sentiments("afinn") # assigns each word a score between -5 and 5
## # A tibble: 2,477 x 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ... with 2,467 more rows
get_sentiments("bing") # categorizes words in a binary fashion into positive and negative categories
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ... with 6,776 more rows
get_sentiments("nrc") # categorizes words into positive, negative, fear, anger, disgust, anticipation, joy, sadness, surprise and trust
## # A tibble: 13,875 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ... with 13,865 more rows
get_sentiments("loughran")
## # A tibble: 4,150 x 2
##    word         sentiment
##    <chr>        <chr>    
##  1 abandon      negative 
##  2 abandoned    negative 
##  3 abandoning   negative 
##  4 abandonment  negative 
##  5 abandonments negative 
##  6 abandons     negative 
##  7 abdicated    negative 
##  8 abdicates    negative 
##  9 abdicating   negative 
## 10 abdication   negative 
## # ... with 4,140 more rows

Sentiment analysis with an inner join on the Wealth of Nations

# tokenize the texts from the three books 
my_books_tidy_books <- my_corpus %>% group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(text, 
                                regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup() %>% unnest_tokens(word, text)

# filter the joy words from the NRC lexicon
nrc_joy <- get_sentiments("nrc") %>% filter(sentiment == "joy")

# filter the tidy text dataframe in my_books_tidy_books for words from "Wealth of Nations"
my_books_tidy_books %>% filter(book == "An Inquiry into the Nature and Causes of the Wealth of Nations") %>% inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 261 x 2
##    word            n
##    <chr>       <int>
##  1 money         770
##  2 present       447
##  3 wages         359
##  4 pay           328
##  5 good          319
##  6 kind          276
##  7 improvement   270
##  8 established   198
##  9 bounty        174
## 10 wealth        171
## # ... with 251 more rows

Sentiment analysis across my books using the Loughran lexicon

# get Loughran sentiment and keep only the positive and negative sentiment labels
loughran_sentiments <- get_sentiments("loughran") %>% filter(sentiment %in% c("positive", "negative"))
# sentiment for my corpus books using the loughran lexicon
my_corpus_sentiment <- my_books_tidy_books %>% inner_join(loughran_sentiments) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
my_corpus_sentiment
## # A tibble: 668 x 5
##    book                                        index negative positive sentiment
##    <chr>                                       <dbl>    <int>    <int>     <int>
##  1 An Inquiry into the Nature and Causes of t~     0        4        5         1
##  2 An Inquiry into the Nature and Causes of t~     1        8       14         6
##  3 An Inquiry into the Nature and Causes of t~     2        2       16        14
##  4 An Inquiry into the Nature and Causes of t~     3        8       19        11
##  5 An Inquiry into the Nature and Causes of t~     4       12       23        11
##  6 An Inquiry into the Nature and Causes of t~     5        5       20        15
##  7 An Inquiry into the Nature and Causes of t~     6        5       13         8
##  8 An Inquiry into the Nature and Causes of t~     7        2       12        10
##  9 An Inquiry into the Nature and Causes of t~     8        3       14        11
## 10 An Inquiry into the Nature and Causes of t~     9        9       26        17
## # ... with 658 more rows
ggplot(my_corpus_sentiment, aes(index, sentiment, fill = book)) + geom_col(show.legend = FALSE) + 
  facet_wrap(~book, ncol = 3, scales = "free_x")

The Economic Consequences of the Peace clearly has the most consistently negative sentiment of the three books when using the Loughran lexicon. I suspect that is because it deals mainly with WWI, rebuilding, and reparations. Bringing back the bigrams from my blog post 4 (sketched below) gives a rough view into some of the main conversations in the book. The Wealth of Nations has the most positive sentiment score, although it also has a brief but extreme negative stretch about 3/4 of the way through the book. On the Principles of the Political Economy and Taxation is the most neutral in sentiment overall.
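
That bigram tokenization looked roughly like this (a sketch from memory, not the exact code from post 4):

# tokenize into two-word phrases and drop pairs where either word is a stop word
consequence_bigrams <- consequence_of_peace %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)
consequence_bigrams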

Sentiment analysis across books using the Bing lexicon

# get Bing sentiment and filter for only the positive and negative sentiment labels
bing_sentiments <- get_sentiments("bing") %>% filter(sentiment %in% c("positive", "negative"))
# sentiment for my corpus books using the Bing lexicon
my_corpus_sentiment_bing <- my_books_tidy_books %>% inner_join(bing_sentiments) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(my_corpus_sentiment_bing, aes(index, sentiment, fill = book)) + geom_col(show.legend = FALSE) + 
  facet_wrap(~book, ncol = 3, scales = "free_x")

The overall positive sentiment increases notably when the Bing lexicon is used in place of Loughran.

Sentiment analysis across books using the nrc lexicon

# get nrc sentiment and keep all ten sentiment labels (the filter is effectively a no-op, listed for clarity)
nrc_sentiments <- get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative","fear", "anger", "disgust", "anticipation", "joy", "sadness", "surprise", "trust"))
# sentiment for my corpus books using the nrc lexicon
my_corpus_sentiment_nrc <- my_books_tidy_books %>% inner_join(nrc_sentiments) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(my_corpus_sentiment_nrc, aes(index, sentiment, fill = book)) + geom_col(show.legend = FALSE) + 
  facet_wrap(~book, ncol = 3, scales = "free_x")

Using the nrc lexicon, all of the books appear to have more positive sentiment than with either the Loughran or Bing lexicons. nrc takes into account a wide range of emotions, which I think is the reason.
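
One way to sanity-check that is to compare the positive/negative balance of the lexicons themselves (afinn is left out because it uses numeric scores rather than labels):

# each lexicon has a different ratio of positive to negative entries,
# which shifts the baseline of the net sentiment scores
bind_rows(
  get_sentiments("bing") %>% mutate(lexicon = "bing"),
  get_sentiments("loughran") %>% mutate(lexicon = "loughran"),
  get_sentiments("nrc") %>% mutate(lexicon = "nrc")
) %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(lexicon, sentiment)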

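The two word counts below are the most common non-stop words in The Economic Consequences of the Peace and The Wealth of Nations; the calls that produced them were lost in knitting, but a likely reconstruction is:

# likely reconstruction: most common non-stop words in Consequences of the Peace
my_books_tidy_books %>%
  filter(book == "The Economic Consequences of the Peace") %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)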
## Joining, by = "word"
## # A tibble: 5,818 x 2
##    word           n
##    <chr>      <int>
##  1 germany      417
##  2 german       245
##  3 war          226
##  4 treaty       173
##  5 allies       156
##  6 europe       152
##  7 france       146
##  8 reparation   142
##  9 economic     136
## 10 commission   124
## # ... with 5,808 more rows
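# likely reconstruction: the same count for The Wealth of Nations
my_books_tidy_books %>%
  filter(book == "An Inquiry into the Nature and Causes of the Wealth of Nations") %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)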
## Joining, by = "word"
## # A tibble: 12,270 x 2
##    word         n
##    <chr>    <int>
##  1 price     2326
##  2 country   1652
##  3 labour    1645
##  4 produce   1561
##  5 quantity  1335
##  6 money     1318
##  7 capital   1287
##  8 trade     1203
##  9 land      1176
## 10 rent      1045
## # ... with 12,260 more rows
bing_word_counts <- my_books_tidy_books %>% inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>% ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,050 x 3
##    word        sentiment     n
##    <chr>       <chr>     <int>
##  1 great       positive   1809
##  2 gold        positive    741
##  3 fall        negative    498
##  4 well        positive    459
##  5 good        positive    370
##  6 work        positive    347
##  7 improvement positive    313
##  8 sufficient  positive    277
##  9 productive  positive    272
## 10 afford      positive    268
## # ... with 2,040 more rows

The table above shows the sentiment determination of the corpus using the Bing lexicon. I am leaning towards using this measure for my final data because I do not like how the Loughran lexicon categorizes words like "greater" as positive. I think this classification might misrepresent the text. I may look into other ways of examining the data, but not at the present moment.
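
A quick way to see the difference is to look the word family up in each lexicon directly:

# look up how the "great" word family is classified in each lexicon
get_sentiments("loughran") %>% filter(word %in% c("great", "greater", "greatest"))
get_sentiments("bing") %>% filter(word %in% c("great", "greater", "greatest"))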

bing_word_counts %>% group_by(sentiment) %>% slice_max(n,n =10) %>% ungroup() %>%
  mutate(word = reorder(word, n)) %>% ggplot(aes(n, word, fill = sentiment)) + geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") + labs(x = "Contribution to sentiment", y = NULL)
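
If I decide words like "great" and "gold" are really just economics vocabulary rather than sentiment, one option (a sketch, with my own hypothetical custom_stop_words name) is to strip them out before the join:

# hypothetical custom stop list: remove domain terms before the sentiment join
custom_stop_words <- bind_rows(
  tibble(word = c("gold", "great", "greater"), lexicon = "custom"),
  stop_words
)
my_books_tidy_books %>%
  anti_join(custom_stop_words, by = "word") %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)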

loughran_word_counts <- my_books_tidy_books %>% inner_join(loughran_sentiments) %>%
  count(word, sentiment, sort = TRUE) %>% ungroup()
## Joining, by = "word"
loughran_word_counts
## # A tibble: 1,115 x 3
##    word        sentiment     n
##    <chr>       <chr>     <int>
##  1 great       positive   1809
##  2 greater     positive   1296
##  3 good        positive    370
##  4 improvement positive    313
##  5 poor        negative    224
##  6 advantage   positive    217
##  7 against     negative    214
##  8 better      positive    212
##  9 monopoly    negative    207
## 10 greatest    positive    192
## # ... with 1,105 more rows
loughran_word_counts %>% group_by(sentiment) %>% slice_max(n,n =10) %>% ungroup() %>%
  mutate(word = reorder(word, n)) %>% ggplot(aes(n, word, fill = sentiment)) + geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") + labs(x = "Contribution to sentiment", y = NULL)

Below I am playing with a few word clouds because they are a fun, quick representation that is easy to interpret.

my_books_tidy_books %>% anti_join(stop_words) %>% count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
## Warning in wordcloud(word, n, max.words = 100): price could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): produce could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): proportion could not be fit on
## page. It will not be plotted.
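
The "could not be fit on page" warnings can usually be avoided by shrinking the word-size range with the scale argument (the values here are a guess for my plot device):

# re-draw with a smaller size range so the biggest words fit on the device
my_books_tidy_books %>% anti_join(stop_words) %>% count(word) %>%
  with(wordcloud(word, n, max.words = 100, scale = c(3, 0.5)))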

# using the bing lexicon
my_books_tidy_books %>% inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>% acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(color = c("#cf0a00","#1a954d"), max.words = 100)
## Joining, by = "word"

The Loughran cloud (below) shows what looks like more negative words than the Bing one. What I find interesting is the word "masters" appearing in a positive light. While it has many positive connotations, I know that Smith often uses "masters" in a somewhat negative light, as in "masters of mankind": "All for ourselves, and nothing for other people, seems, in every age of the world, to have been the vile maxim of the masters of mankind." So while this is helpful, it is definitely not 100% accurate in its depiction of the text's sentiment.
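
A quick lookup shows how the lexicons actually tag the word and how often each book uses it:

# how is "masters" tagged, and which book uses it most?
get_sentiments("bing") %>% filter(word == "masters")
get_sentiments("nrc") %>% filter(word == "masters")
my_books_tidy_books %>% filter(word == "masters") %>% count(book, sort = TRUE)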

# using the loughran lexicon
my_books_tidy_books %>% inner_join(loughran_sentiments) %>%
  count(word, sentiment, sort = TRUE) %>% acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(color = c("#cf0a00","#1a954d"), max.words = 100)
## Joining, by = "word"

# using the nrc lexicon
my_books_tidy_books %>% inner_join(get_sentiments("nrc")) %>%
  count(word, sentiment, sort = TRUE) %>% acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(color = c("#cf0a00","#1a954d","burlywood1","moccasin","orange3","peachpuff4","gainsboro" , "darkviolet", "bisque3"  ,"coral2" ), max.words = 100)
## Joining, by = "word"
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## abundance could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## prohibited could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## diminish could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## possession could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## enemy could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## damage could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## inferior could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## larger could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## poverty could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## distress could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## possessed could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## dispose could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## necessity could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## uncertain could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## unequal could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## impossible could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## subsidy could not be fit on page. It will not be plotted.

I think that using nrc is a bit too much across all three books at once; it would be better suited to analyzing one book at a time.
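
For a single book, a sketch might be:

# full nrc emotion profile for just one book
my_books_tidy_books %>%
  filter(book == "The Economic Consequences of the Peace") %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment, sort = TRUE)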

Searching for some important words using term frequency analysis

book_words <- my_corpus %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)

total_words <- book_words %>% group_by(book) %>% summarize(total = sum(n))
book_words <- left_join(book_words, total_words)
## Joining, by = "book"
book_words
## # A tibble: 22,592 x 4
##    book                                                       word      n  total
##    <chr>                                                      <chr> <int>  <int>
##  1 An Inquiry into the Nature and Causes of the Wealth of Na~ the   32240 382608
##  2 An Inquiry into the Nature and Causes of the Wealth of Na~ of    24294 382608
##  3 An Inquiry into the Nature and Causes of the Wealth of Na~ to    11708 382608
##  4 An Inquiry into the Nature and Causes of the Wealth of Na~ and   10284 382608
##  5 An Inquiry into the Nature and Causes of the Wealth of Na~ in     9637 382608
##  6 On the Principles of the Political Economy and Taxation    the    9586 118015
##  7 On the Principles of the Political Economy and Taxation    of     6981 118015
##  8 An Inquiry into the Nature and Causes of the Wealth of Na~ a      6673 382608
##  9 The Economic Consequences of the Peace                     the    5830  70278
## 10 An Inquiry into the Nature and Causes of the Wealth of Na~ it     5391 382608
## # ... with 22,582 more rows
ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 425 rows containing non-finite values (stat_bin).
## Warning: Removed 3 rows containing missing values (geom_bar).

Of course, as always, there are a lot of very common words ("the", "of", "to", and so on) that we can't really get any information from as unigrams. The tf-idf weighting below effectively disregards them by pushing their scores to zero.

book_words <- book_words %>%
  bind_tf_idf(word, book, n)
book_words
## # A tibble: 22,592 x 7
##    book                                   word      n  total     tf   idf tf_idf
##    <chr>                                  <chr> <int>  <int>  <dbl> <dbl>  <dbl>
##  1 An Inquiry into the Nature and Causes~ the   32240 382608 0.0843     0      0
##  2 An Inquiry into the Nature and Causes~ of    24294 382608 0.0635     0      0
##  3 An Inquiry into the Nature and Causes~ to    11708 382608 0.0306     0      0
##  4 An Inquiry into the Nature and Causes~ and   10284 382608 0.0269     0      0
##  5 An Inquiry into the Nature and Causes~ in     9637 382608 0.0252     0      0
##  6 On the Principles of the Political Ec~ the    9586 118015 0.0812     0      0
##  7 On the Principles of the Political Ec~ of     6981 118015 0.0592     0      0
##  8 An Inquiry into the Nature and Causes~ a      6673 382608 0.0174     0      0
##  9 The Economic Consequences of the Peace the    5830  70278 0.0830     0      0
## 10 An Inquiry into the Nature and Causes~ it     5391 382608 0.0141     0      0
## # ... with 22,582 more rows
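
The idf values are easy to verify by hand: with N = 3 books, idf(term) = ln(N / number of books containing the term), so a word in all three books scores 0, a word in two scores ln(3/2) ≈ 0.405, and a word in just one scores ln(3) ≈ 1.10, which matches the idf column above.

# idf for a term that appears in 3, 2, or 1 of the 3 books
log(3 / c(3, 2, 1))  # 0.000, 0.405, 1.099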

Something useful is coming!

book_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 22,592 x 6
##    book                                        word      n      tf   idf  tf_idf
##    <chr>                                       <chr> <int>   <dbl> <dbl>   <dbl>
##  1 On the Principles of the Political Economy~ labo~   629 0.00533 0.405 2.16e-3
##  2 The Economic Consequences of the Peace      econ~   136 0.00194 1.10  2.13e-3
##  3 On the Principles of the Political Economy~ corn    565 0.00479 0.405 1.94e-3
##  4 On the Principles of the Political Economy~ rent    545 0.00462 0.405 1.87e-3
##  5 On the Principles of the Political Economy~ tax     483 0.00409 0.405 1.66e-3
##  6 The Economic Consequences of the Peace      germ~    95 0.00135 1.10  1.49e-3
##  7 The Economic Consequences of the Peace      germ~   245 0.00349 0.405 1.41e-3
##  8 The Economic Consequences of the Peace      1919     72 0.00102 1.10  1.13e-3
##  9 An Inquiry into the Nature and Causes of t~ labo~  1011 0.00264 0.405 1.07e-3
## 10 The Economic Consequences of the Peace      alli~   156 0.00222 0.405 9.00e-4
## # ... with 22,582 more rows

Using term frequency-inverse document frequency (tf-idf) analysis, we can get a better idea of our topics and see how important a word is to each book individually relative to the corpus as a whole. In this analysis we see that the most important word in both On the Principles of the Political Economy and Taxation and The Wealth of Nations is "labour", and the most important word in The Economic Consequences of the Peace is "economic". According to this analysis, The Wealth of Nations and On the Principles of the Political Economy and Taxation are more alike in content, as both have "labour", "tax", "rent", and "corn" near the top of the ranking. The Economic Consequences of the Peace appears to be on a different topic, with "german", "allies", "reparation", and "armistice" appearing as its most important words according to the tf-idf analysis.

book_words %>% arrange(desc(tf_idf)) %>% 
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(book) %>% 
  top_n(20) %>% 
  ungroup() %>% 
  ggplot(aes(word, tf_idf, fill = book)) + 
  geom_col(show.legend = FALSE) + 
  labs(x = NULL, y = "tf-idf scores") + 
  facet_wrap(~book, ncol = 3, scales = "free_y") + 
  coord_flip()
## Selecting by tf_idf

book_words %>%
  filter(book == "The Economic Consequences of the Peace") %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 7,122 x 6
##    book                                   word           n      tf   idf  tf_idf
##    <chr>                                  <chr>      <int>   <dbl> <dbl>   <dbl>
##  1 The Economic Consequences of the Peace economic     136 1.94e-3 1.10  2.13e-3
##  2 The Economic Consequences of the Peace germany's     95 1.35e-3 1.10  1.49e-3
##  3 The Economic Consequences of the Peace german       245 3.49e-3 0.405 1.41e-3
##  4 The Economic Consequences of the Peace 1919          72 1.02e-3 1.10  1.13e-3
##  5 The Economic Consequences of the Peace allies       156 2.22e-3 0.405 9.00e-4
##  6 The Economic Consequences of the Peace reparation   141 2.01e-3 0.405 8.13e-4
##  7 The Economic Consequences of the Peace armistice     46 6.55e-4 1.10  7.19e-4
##  8 The Economic Consequences of the Peace commission   123 1.75e-3 0.405 7.10e-4
##  9 The Economic Consequences of the Peace allied       101 1.44e-3 0.405 5.83e-4
## 10 The Economic Consequences of the Peace nationals     37 5.26e-4 1.10  5.78e-4
## # ... with 7,112 more rows
book_words %>%
  filter(book == "On the Principles of the Political Economy and Taxation") %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 5,229 x 6
##    book                                        word      n      tf   idf  tf_idf
##    <chr>                                       <chr> <int>   <dbl> <dbl>   <dbl>
##  1 On the Principles of the Political Economy~ labo~   629 5.33e-3 0.405 2.16e-3
##  2 On the Principles of the Political Economy~ corn    565 4.79e-3 0.405 1.94e-3
##  3 On the Principles of the Political Economy~ rent    545 4.62e-3 0.405 1.87e-3
##  4 On the Principles of the Political Economy~ tax     483 4.09e-3 0.405 1.66e-3
##  5 On the Principles of the Political Economy~ 4_l      73 6.19e-4 1.10  6.80e-4
##  6 On the Principles of the Political Economy~ 100_l    69 5.85e-4 1.10  6.42e-4
##  7 On the Principles of the Political Economy~ smith   134 1.14e-3 0.405 4.60e-4
##  8 On the Principles of the Political Economy~ 1000~    49 4.15e-4 1.10  4.56e-4
##  9 On the Principles of the Political Economy~ land~   128 1.08e-3 0.405 4.40e-4
## 10 On the Principles of the Political Economy~ cloth   119 1.01e-3 0.405 4.09e-4
## # ... with 5,219 more rows
book_words %>%
  filter(book == "An Inquiry into the Nature and Causes of the Wealth of Nations") %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 10,241 x 6
##    book                                        word      n      tf   idf  tf_idf
##    <chr>                                       <chr> <int>   <dbl> <dbl>   <dbl>
##  1 An Inquiry into the Nature and Causes of t~ labo~  1011 2.64e-3 0.405 1.07e-3
##  2 An Inquiry into the Nature and Causes of t~ tax     513 1.34e-3 0.405 5.44e-4
##  3 An Inquiry into the Nature and Causes of t~ rent    495 1.29e-3 0.405 5.25e-4
##  4 An Inquiry into the Nature and Causes of t~ corn    445 1.16e-3 0.405 4.72e-4
##  5 An Inquiry into the Nature and Causes of t~ fort~   109 2.85e-4 1.10  3.13e-4
##  6 An Inquiry into the Nature and Causes of t~ shil~   256 6.69e-4 0.405 2.71e-4
##  7 An Inquiry into the Nature and Causes of t~ cler~    88 2.30e-4 1.10  2.53e-4
##  8 An Inquiry into the Nature and Causes of t~ anci~   224 5.85e-4 0.405 2.37e-4
##  9 An Inquiry into the Nature and Causes of t~ educ~    81 2.12e-4 1.10  2.33e-4
## 10 An Inquiry into the Nature and Causes of t~ arti~    72 1.88e-4 1.10  2.07e-4
## # ... with 10,231 more rows