For this blog post I think I have gotten a handle on my sentiment analysis, and I'm down to just choosing which lexicons I want to use.
library(textdata)
## Warning: package 'textdata' was built under R version 4.1.3
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext) # for text mining
library(gutenbergr)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(wordcloud)
## Loading required package: RColorBrewer
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
library(dplyr)
# books_corpus <- gutenberg_download(c(15776, 33310, 3300), mirror = NULL, verbose = TRUE)
# books_corpus
consequence_of_peace <- gutenberg_download(15776)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
names(consequence_of_peace) <- c("book", "text") # rename the column names so the gutenberg_id column is called book
consequence_of_peace$book <- "The Economic Consequences of the Peace" # replace the gutenberg_id with the book name so it is intuitive
# download the book "Principles of the Politcal Economy and Taxation"
politcal_economy_taxation <- gutenberg_download(33310)
names(politcal_economy_taxation) <- c("book", "text") # rename the column names so the gutenberg_id column is called book
politcal_economy_taxation$book <- "On the Principles of the Political Economy and Taxation" #replace the gutenberg_id with the book name so it is intuitive
# download the book "An Inquiry into the Nature and Causes of the Wealth of Nations" from project gutenberg
wealth_of_nations <- gutenberg_download(3300)
names(wealth_of_nations) <- c("book", "text") # rename the column names so the gutenberg_id column is called book
wealth_of_nations$book <- "An Inquiry into the Nature and Causes of the Wealth of Nations" # replace the gutenberg_id with the book name so it is intuitive
# combine all three books into one dataset
my_corpus <- rbind(wealth_of_nations, political_economy_taxation, consequence_of_peace)
my_corpus
## # A tibble: 53,583 x 2
## book text
## <chr> <chr>
## 1 An Inquiry into the Nature and Causes of the Wealth of Nations "An Inquiry i~
## 2 An Inquiry into the Nature and Causes of the Wealth of Nations ""
## 3 An Inquiry into the Nature and Causes of the Wealth of Nations ""
## 4 An Inquiry into the Nature and Causes of the Wealth of Nations ""
## 5 An Inquiry into the Nature and Causes of the Wealth of Nations "by Adam Smit~
## 6 An Inquiry into the Nature and Causes of the Wealth of Nations ""
## 7 An Inquiry into the Nature and Causes of the Wealth of Nations ""
## 8 An Inquiry into the Nature and Causes of the Wealth of Nations ""
## 9 An Inquiry into the Nature and Causes of the Wealth of Nations ""
## 10 An Inquiry into the Nature and Causes of the Wealth of Nations ""
## # ... with 53,573 more rows
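As an aside, the commented-out chunk at the top hints at a one-call version of this; gutenbergr's meta_fields argument can attach the title column directly, which would replace the manual renaming above. A sketch I have not actually run here:
# hypothetical one-call alternative: download all three books with their titles
my_corpus_alt <- gutenberg_download(c(15776, 33310, 3300), meta_fields = "title") %>%
  select(book = title, text)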
#load stop words and sentiment lexicons
data("stop_words")
get_sentiments("afinn") # assigns words with a score that runs between -5 and 5
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
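afinn is the one lexicon I don't plot below; because it scores words numerically rather than labeling them, the natural aggregation is a sum of the value column. A minimal sketch, assuming the my_corpus tibble built above:
# sketch: net afinn score per book (not used in the plots that follow)
my_corpus %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(book) %>%
  summarise(net_score = sum(value), .groups = "drop")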
get_sentiments("bing") # categorizes words in a binary fashion into positive and negative categories
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
get_sentiments("nrc") # categorizes words into positive, negative, fear, anger, disgust, anticipation, joy, sadness, surprise and trust
## # A tibble: 13,875 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,865 more rows
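nrc tags many words with more than one category, so its 13,875 rows cover far fewer distinct words; counting the sentiment column shows how the ten categories are distributed:
# how many word-sentiment pairs fall under each nrc category
get_sentiments("nrc") %>% count(sentiment, sort = TRUE)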
get_sentiments("loughran")
## # A tibble: 4,150 x 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # ... with 4,140 more rows
# tokenize the texts from the three books
my_books_tidy_books <- my_corpus %>% group_by(book) %>%
mutate(
linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
ungroup() %>% unnest_tokens(word, text)
# filter the joy words from the NRC lexicon
nrc_joy <- get_sentiments("nrc") %>% filter(sentiment == "joy")
# filter the tidy text dataframe in my_books_tidy_books for words from "Wealth of Nations"
my_books_tidy_books %>% filter(book == "An Inquiry into the Nature and Causes of the Wealth of Nations") %>% inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 261 x 2
## word n
## <chr> <int>
## 1 money 770
## 2 present 447
## 3 wages 359
## 4 pay 328
## 5 good 319
## 6 kind 276
## 7 improvement 270
## 8 established 198
## 9 bounty 174
## 10 wealth 171
## # ... with 251 more rows
# get the loughran sentiments and filter for only the positive and negative labels
loughran_sentiments <- get_sentiments("loughran") %>% filter(sentiment %in% c("positive", "negative"))
# net sentiment (positive minus negative) per 80-line chunk of each book, using the loughran lexicon
my_corpus_sentiment <- my_books_tidy_books %>% inner_join(loughran_sentiments) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
my_corpus_sentiment
## # A tibble: 668 x 5
## book index negative positive sentiment
## <chr> <dbl> <int> <int> <int>
## 1 An Inquiry into the Nature and Causes of t~ 0 4 5 1
## 2 An Inquiry into the Nature and Causes of t~ 1 8 14 6
## 3 An Inquiry into the Nature and Causes of t~ 2 2 16 14
## 4 An Inquiry into the Nature and Causes of t~ 3 8 19 11
## 5 An Inquiry into the Nature and Causes of t~ 4 12 23 11
## 6 An Inquiry into the Nature and Causes of t~ 5 5 20 15
## 7 An Inquiry into the Nature and Causes of t~ 6 5 13 8
## 8 An Inquiry into the Nature and Causes of t~ 7 2 12 10
## 9 An Inquiry into the Nature and Causes of t~ 8 3 14 11
## 10 An Inquiry into the Nature and Causes of t~ 9 9 26 17
## # ... with 658 more rows
ggplot(my_corpus_sentiment, aes(index, sentiment, fill = book)) + geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 3, scales = "free_x")
The Economic Consequences of the Peace clearly has the most consistently negative sentiment of the three books when using the loughran lexicon. I suspect that is because it deals mainly with WWI, rebuilding, and reparations. Bringing back the bigrams from my blog post 4 (sketched below) gives a rough view into some of the main conversations in the book. The Wealth of Nations has the most positive sentiment score, although it also shows a brief but extreme negative dip about 3/4 of the way through the book. On the Principles of the Political Economy and Taxation is the most neutral on sentiment overall.
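For reference, a minimal sketch of how those bigrams were counted in blog post 4, assuming the same my_corpus tibble:
# bigram counts, roughly as in blog post 4 (sketch)
my_corpus %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  count(book, bigram, sort = TRUE)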
# get the bing sentiments and filter for only the positive and negative labels
bing_sentiments <- get_sentiments("bing") %>% filter(sentiment %in% c("positive", "negative"))
# net sentiment per 80-line chunk of each book, using the bing lexicon
my_corpus_sentiment_bing <- my_books_tidy_books %>% inner_join(bing_sentiments) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(my_corpus_sentiment_bing, aes(index, sentiment, fill = book)) + geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 3, scales = "free_x")
The positive sentiment increases notably with the use of the Bing lexicon for sentiment analysis.
# get the nrc sentiments across all ten labels
nrc_sentiments <- get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative", "fear", "anger", "disgust", "anticipation", "joy", "sadness", "surprise", "trust"))
# net sentiment per 80-line chunk of each book, using the nrc lexicon
my_corpus_sentiment_nrc <- my_books_tidy_books %>% inner_join(nrc_sentiments) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(my_corpus_sentiment_nrc, aes(index, sentiment, fill = book)) + geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 3, scales = "free_x")
Using the nrc lexicon, all three books appear to have more positive sentiment than with either the loughran or bing lexicons. nrc takes into account a wide range of emotions, which I think is the reason.
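The two count tables below come from a chunk whose code didn't make it into the rendered post; a sketch of what likely produced the first one (the most frequent non-stop words in The Economic Consequences of the Peace):
my_books_tidy_books %>%
  filter(book == "The Economic Consequences of the Peace") %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)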
## Joining, by = "word"
## # A tibble: 5,818 x 2
## word n
## <chr> <int>
## 1 germany 417
## 2 german 245
## 3 war 226
## 4 treaty 173
## 5 allies 156
## 6 europe 152
## 7 france 146
## 8 reparation 142
## 9 economic 136
## 10 commission 124
## # ... with 5,808 more rows
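And likely the same count run over the whole corpus:
my_books_tidy_books %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)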
## Joining, by = "word"
## # A tibble: 12,270 x 2
## word n
## <chr> <int>
## 1 price 2326
## 2 country 1652
## 3 labour 1645
## 4 produce 1561
## 5 quantity 1335
## 6 money 1318
## 7 capital 1287
## 8 trade 1203
## 9 land 1176
## 10 rent 1045
## # ... with 12,260 more rows
bing_word_counts <- my_books_tidy_books %>% inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>% ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,050 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 great positive 1809
## 2 gold positive 741
## 3 fall negative 498
## 4 well positive 459
## 5 good positive 370
## 6 work positive 347
## 7 improvement positive 313
## 8 sufficient positive 277
## 9 productive positive 272
## 10 afford positive 268
## # ... with 2,040 more rows
This is the sentiment breakdown of the corpus under the Bing lexicon. I am leaning towards using this measure for my final data because I do not like how the Loughran lexicon categorizes words like “greater”; I think that classification might misrepresent the text. I may look into other ways of examining the data, but not at the present moment.
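If I do revisit it, one option would be a custom exclusion list for words whose tags seem misleading, dropped before the join; a sketch, where the word list is just my own hypothetical picks:
# hypothetical exclusion list for words whose sentiment tag seems off in this corpus
custom_exclusions <- tibble(word = c("great", "greater", "greatest"))
my_books_tidy_books %>%
  anti_join(custom_exclusions, by = "word") %>%
  inner_join(loughran_sentiments, by = "word") %>%
  count(word, sentiment, sort = TRUE)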
bing_word_counts %>% group_by(sentiment) %>% slice_max(n, n = 10) %>% ungroup() %>%
mutate(word = reorder(word, n)) %>% ggplot(aes(n, word, fill = sentiment)) + geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") + labs(x = "Contribution to sentiment", y = NULL)
loughran_word_counts <- my_books_tidy_books %>% inner_join(loughran_sentiments) %>%
count(word, sentiment, sort = TRUE) %>% ungroup()
## Joining, by = "word"
loughran_word_counts
## # A tibble: 1,115 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 great positive 1809
## 2 greater positive 1296
## 3 good positive 370
## 4 improvement positive 313
## 5 poor negative 224
## 6 advantage positive 217
## 7 against negative 214
## 8 better positive 212
## 9 monopoly negative 207
## 10 greatest positive 192
## # ... with 1,105 more rows
loughran_word_counts %>% group_by(sentiment) %>% slice_max(n, n = 10) %>% ungroup() %>%
mutate(word = reorder(word, n)) %>% ggplot(aes(n, word, fill = sentiment)) + geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") + labs(x = "Contribution to sentiment", y = NULL)
Below I am playing with a few word clouds because they are fun, quick representations that are easy to interpret.
my_books_tidy_books %>% anti_join(stop_words) %>% count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
## Warning in wordcloud(word, n, max.words = 100): price could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): produce could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): proportion could not be fit on
## page. It will not be plotted.
# using the bing lexicon
my_books_tidy_books %>% inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>% acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(color = c("#cf0a00","#1a954d"), max.words = 100)
## Joining, by = "word"
The Loughran lexicon gives what looks like more negative words than the bing. What I find interesting is the word “masters” appearing in a positive light. While it has many positive connotations, Smith often uses “masters” in a somewhat negative sense, as in “masters of mankind”: “All for ourselves, and nothing for other people, seems, in every age of the world, to have been the vile maxim of the masters of mankind.” So while this is helpful, it is definitely not 100% accurate in its depiction of the text's sentiment.
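As a quick aside, it is easy to spot-check how a given lexicon tags a specific word; a sketch (output not shown):
# spot-check how two of the lexicons tag "master"/"masters"
get_sentiments("bing") %>% filter(word %in% c("master", "masters"))
get_sentiments("loughran") %>% filter(word %in% c("master", "masters"))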
# using the loughran lexicon
my_books_tidy_books %>% inner_join(loughran_sentiments) %>%
count(word, sentiment, sort = TRUE) %>% acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(color = c("#cf0a00","#1a954d"), max.words = 100)
## Joining, by = "word"
# using the nrc lexicon
my_books_tidy_books %>% inner_join(get_sentiments("nrc")) %>%
count(word, sentiment, sort = TRUE) %>% acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(color = c("#cf0a00","#1a954d","burlywood1","moccasin","orange3","peachpuff4","gainsboro" , "darkviolet", "bisque3" ,"coral2" ), max.words = 100)
## Joining, by = "word"
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## abundance could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## prohibited could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## diminish could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## possession could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## enemy could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## damage could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## inferior could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## larger could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## poverty could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## distress could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## possessed could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## dispose could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## necessity could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## uncertain could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## unequal could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## impossible could not be fit on page. It will not be plotted.
## Warning in comparison.cloud(., color = c("#cf0a00", "#1a954d", "burlywood1", :
## subsidy could not be fit on page. It will not be plotted.
I think that using nrc is a bit much across the 3 books; it would be better used for analysis of one book at a time.
Searching for some important words using term frequency analysis:
book_words <- my_corpus %>%
unnest_tokens(word, text) %>%
count(book, word, sort = TRUE)
total_words <- book_words %>% group_by(book) %>% summarize(total = sum(n))
book_words <- left_join(book_words, total_words)
## Joining, by = "book"
book_words
## # A tibble: 22,592 x 4
## book word n total
## <chr> <chr> <int> <int>
## 1 An Inquiry into the Nature and Causes of the Wealth of Na~ the 32240 382608
## 2 An Inquiry into the Nature and Causes of the Wealth of Na~ of 24294 382608
## 3 An Inquiry into the Nature and Causes of the Wealth of Na~ to 11708 382608
## 4 An Inquiry into the Nature and Causes of the Wealth of Na~ and 10284 382608
## 5 An Inquiry into the Nature and Causes of the Wealth of Na~ in 9637 382608
## 6 On the Principles of the Political Economy and Taxation the 9586 118015
## 7 On the Principles of the Political Economy and Taxation of 6981 118015
## 8 An Inquiry into the Nature and Causes of the Wealth of Na~ a 6673 382608
## 9 The Economic Consequences of the Peace the 5830 70278
## 10 An Inquiry into the Nature and Causes of the Wealth of Na~ it 5391 382608
## # ... with 22,582 more rows
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.0009) +
facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 425 rows containing non-finite values (stat_bin).
## Warning: Removed 3 rows containing missing values (geom_bar).
Of course, as always, there are a lot of common words like “the” and “of” that we can't really get any information from as unigrams. The tf-idf weighting below disregards those.
book_words <- book_words %>%
bind_tf_idf(word, book, n)
book_words
## # A tibble: 22,592 x 7
## book word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 An Inquiry into the Nature and Causes~ the 32240 382608 0.0843 0 0
## 2 An Inquiry into the Nature and Causes~ of 24294 382608 0.0635 0 0
## 3 An Inquiry into the Nature and Causes~ to 11708 382608 0.0306 0 0
## 4 An Inquiry into the Nature and Causes~ and 10284 382608 0.0269 0 0
## 5 An Inquiry into the Nature and Causes~ in 9637 382608 0.0252 0 0
## 6 On the Principles of the Political Ec~ the 9586 118015 0.0812 0 0
## 7 On the Principles of the Political Ec~ of 6981 118015 0.0592 0 0
## 8 An Inquiry into the Nature and Causes~ a 6673 382608 0.0174 0 0
## 9 The Economic Consequences of the Peace the 5830 70278 0.0830 0 0
## 10 An Inquiry into the Nature and Causes~ it 5391 382608 0.0141 0 0
## # ... with 22,582 more rows
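A quick sanity check on those idf values: with three books, idf is just the natural log of 3 divided by the number of books a word appears in, which is why words common to all three get exactly zero.
log(3 / 3) # 0 for words like "the" that appear in all three books
log(3 / 2) # 0.405 for words that appear in two of the three books
log(3 / 1) # 1.10 for words unique to a single book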
Something useful is coming!
book_words %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 22,592 x 6
## book word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 On the Principles of the Political Economy~ labo~ 629 0.00533 0.405 2.16e-3
## 2 The Economic Consequences of the Peace econ~ 136 0.00194 1.10 2.13e-3
## 3 On the Principles of the Political Economy~ corn 565 0.00479 0.405 1.94e-3
## 4 On the Principles of the Political Economy~ rent 545 0.00462 0.405 1.87e-3
## 5 On the Principles of the Political Economy~ tax 483 0.00409 0.405 1.66e-3
## 6 The Economic Consequences of the Peace germ~ 95 0.00135 1.10 1.49e-3
## 7 The Economic Consequences of the Peace germ~ 245 0.00349 0.405 1.41e-3
## 8 The Economic Consequences of the Peace 1919 72 0.00102 1.10 1.13e-3
## 9 An Inquiry into the Nature and Causes of t~ labo~ 1011 0.00264 0.405 1.07e-3
## 10 The Economic Consequences of the Peace alli~ 156 0.00222 0.405 9.00e-4
## # ... with 22,582 more rows
Using a term frequency-inverse document frequency (tf-idf) analysis, we can get a better idea of our topics and see how important a word is to each book individually and to the corpus as a whole. In this analysis we see that the most important word in both Principles of the Political Economy and Taxation and Wealth of Nations is “labour”, and the most important word in Consequences of the Peace is “economic”. According to this analysis, Wealth of Nations and Principles of the Political Economy and Taxation are more alike in content, as both have “labour”, “tax”, “rent” and “corn” at the top of the tf-idf ranking. The Economic Consequences of the Peace appears to be on a different topic, with “german”, “allies”, “reparation” and “armistice” appearing as its most important words.
book_words %>% arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(book) %>%
top_n(20) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = book)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf scores") +
facet_wrap(~book, ncol = 3, scales = "free_y", labeller = label_wrap_gen(25)) + # wrap the long book titles
coord_flip()
## Selecting by tf_idf
book_words %>%
filter(book == "The Economic Consequences of the Peace") %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 7,122 x 6
## book word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 The Economic Consequences of the Peace economic 136 1.94e-3 1.10 2.13e-3
## 2 The Economic Consequences of the Peace germany's 95 1.35e-3 1.10 1.49e-3
## 3 The Economic Consequences of the Peace german 245 3.49e-3 0.405 1.41e-3
## 4 The Economic Consequences of the Peace 1919 72 1.02e-3 1.10 1.13e-3
## 5 The Economic Consequences of the Peace allies 156 2.22e-3 0.405 9.00e-4
## 6 The Economic Consequences of the Peace reparation 141 2.01e-3 0.405 8.13e-4
## 7 The Economic Consequences of the Peace armistice 46 6.55e-4 1.10 7.19e-4
## 8 The Economic Consequences of the Peace commission 123 1.75e-3 0.405 7.10e-4
## 9 The Economic Consequences of the Peace allied 101 1.44e-3 0.405 5.83e-4
## 10 The Economic Consequences of the Peace nationals 37 5.26e-4 1.10 5.78e-4
## # ... with 7,112 more rows
book_words %>%
filter(book == "On the Principles of the Political Economy and Taxation") %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 5,229 x 6
## book word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 On the Principles of the Political Economy~ labo~ 629 5.33e-3 0.405 2.16e-3
## 2 On the Principles of the Political Economy~ corn 565 4.79e-3 0.405 1.94e-3
## 3 On the Principles of the Political Economy~ rent 545 4.62e-3 0.405 1.87e-3
## 4 On the Principles of the Political Economy~ tax 483 4.09e-3 0.405 1.66e-3
## 5 On the Principles of the Political Economy~ 4_l 73 6.19e-4 1.10 6.80e-4
## 6 On the Principles of the Political Economy~ 100_l 69 5.85e-4 1.10 6.42e-4
## 7 On the Principles of the Political Economy~ smith 134 1.14e-3 0.405 4.60e-4
## 8 On the Principles of the Political Economy~ 1000~ 49 4.15e-4 1.10 4.56e-4
## 9 On the Principles of the Political Economy~ land~ 128 1.08e-3 0.405 4.40e-4
## 10 On the Principles of the Political Economy~ cloth 119 1.01e-3 0.405 4.09e-4
## # ... with 5,219 more rows
book_words %>%
filter(book == "An Inquiry into the Nature and Causes of the Wealth of Nations") %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 10,241 x 6
## book word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 An Inquiry into the Nature and Causes of t~ labo~ 1011 2.64e-3 0.405 1.07e-3
## 2 An Inquiry into the Nature and Causes of t~ tax 513 1.34e-3 0.405 5.44e-4
## 3 An Inquiry into the Nature and Causes of t~ rent 495 1.29e-3 0.405 5.25e-4
## 4 An Inquiry into the Nature and Causes of t~ corn 445 1.16e-3 0.405 4.72e-4
## 5 An Inquiry into the Nature and Causes of t~ fort~ 109 2.85e-4 1.10 3.13e-4
## 6 An Inquiry into the Nature and Causes of t~ shil~ 256 6.69e-4 0.405 2.71e-4
## 7 An Inquiry into the Nature and Causes of t~ cler~ 88 2.30e-4 1.10 2.53e-4
## 8 An Inquiry into the Nature and Causes of t~ anci~ 224 5.85e-4 0.405 2.37e-4
## 9 An Inquiry into the Nature and Causes of t~ educ~ 81 2.12e-4 1.10 2.33e-4
## 10 An Inquiry into the Nature and Causes of t~ arti~ 72 1.88e-4 1.10 2.07e-4
## # ... with 10,231 more rows