Loading the necessary packages.
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: RColorBrewer
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v stringr 1.4.0
## v tidyr 1.2.0 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
## Package version: 3.2.0
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
##
## stopwords
## The following objects are masked from 'package:NLP':
##
## meta, meta<-
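The loading chunk itself isn't echoed above, but judging from the startup messages and the functions used later on, it loads roughly the following packages. This is a reconstruction, not the original chunk:

# Reconstruction (an assumption based on the startup messages above
# and the functions used later in this document)
library(tidyverse)   # dplyr, ggplot2, stringr, tidyr, forcats, purrr, ...
library(tidytext)    # unnest_tokens(), stop_words, get_sentiments()
library(gutenbergr)  # gutenberg_metadata, gutenberg_download()
library(wordcloud)   # probably what triggered the RColorBrewer message
library(tm)          # what triggered the NLP message
library(quanteda)
library(ggraph)      # for the bigram network plots later (igraph gets loaded further down)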
Look up each book for analysis using its ID number on Project Gutenberg. The first book I want to work with is the one that started it all, The Wealth of Nations by Adam Smith.
# Looking up the right book ID in Project Gutenberg
gutenberg_metadata %>%
filter(title == "An Inquiry into the Nature and Causes of the Wealth of Nations")
## # A tibble: 1 x 8
## gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 3300 An Inqu~ Smith~ 1158 en Harvard Classic~ Publi~
## # ... with 1 more variable: has_text <lgl>
gutenberg_metadata %>%
filter(title == "The Economic Consequences of the Peace")
## # A tibble: 1 x 8
## gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 15776 The Eco~ Keyne~ 6280 en World War I/Bes~ Publi~
## # ... with 1 more variable: has_text <lgl>
gutenberg_metadata %>%
filter(title == "Lombard Street: A Description of the Money Market")
## # A tibble: 1 x 8
## gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 4359 Lombard~ Bageh~ 1461 en <NA> Publi~
## # ... with 1 more variable: has_text <lgl>
gutenberg_metadata %>%
filter(title == "The History of Currency, 1252 to 1896")
## # A tibble: 1 x 8
## gutenberg_id title author gutenberg_autho~ language gutenberg_books~ rights
## <int> <chr> <chr> <int> <chr> <chr> <chr>
## 1 38381 The His~ Shaw,~ 39433 en <NA> Publi~
## # ... with 1 more variable: has_text <lgl>
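Side note: if I didn't know the exact title, gutenbergr also has gutenberg_works(), which pre-filters the metadata to works with downloadable text and accepts filter conditions directly. A rough sketch (the str_detect() pattern is just for illustration):

# Sketch: fuzzy title search limited to works that have text available
gutenberg_works(str_detect(title, "Wealth of Nations"))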
For now I am going to work with two books: The Wealth of Nations and The Economic Consequences of the Peace.
wealth_of_nations <- gutenberg_download(3300, mirror = "http://mirrors.xmission.com/gutenberg/")
consequence_of_peace <- gutenberg_download(15776, mirror = "http://mirrors.xmission.com/gutenberg/")
## cols(
## gutenberg_id = col_double(),
## text = col_character()
## )
## Rows: 34546 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): text
## dbl (1): gutenberg_id
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 34,546 x 2
## gutenberg_id text
## <dbl> <chr>
## 1 3300 An Inquiry into the Nature and Causes of the Wealth of Nations
## 2 3300 <NA>
## 3 3300 <NA>
## 4 3300 <NA>
## 5 3300 by Adam Smith
## 6 3300 <NA>
## 7 3300 <NA>
## 8 3300 <NA>
## 9 3300 <NA>
## 10 3300 <NA>
## # ... with 34,536 more rows
## Rows: 7284 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): text
## dbl (1): gutenberg_id
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 7,284 x 2
## gutenberg_id text
## <dbl> <chr>
## 1 15776 THE ECONOMIC CONSEQUENCES OF THE PEACE
## 2 15776 <NA>
## 3 15776 by
## 4 15776 <NA>
## 5 15776 JOHN MAYNARD KEYNES, C.B.
## 6 15776 Fellow of King's College, Cambridge
## 7 15776 <NA>
## 8 15776 New York
## 9 15776 Harcourt, Brace and Howe
## 10 15776 <NA>
## # ... with 7,274 more rows
I need to do a bit of cleanup on both of these documents before I start counting anything.
# Remove the literal string "NA" from Wealth of Nations
# (note: str_remove_all() strips the characters "NA" wherever they appear,
# so an all-caps word like "NATURALLY" becomes "TURALLY"; it does not drop
# missing values)
wealth_of_nations <-
  wealth_of_nations %>%
  mutate(text = str_remove_all(text, "NA"))

# Remove the literal string "NA" from Consequences of the Peace
consequence_of_peace <-
  consequence_of_peace %>%
  mutate(text = str_remove_all(text, "NA"))
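A gentler alternative (just a sketch, not what I actually ran above) would be to target the missing values themselves rather than the characters "NA", for example with tidyr's replace_na():

# Sketch only: replace true missing values with empty strings,
# leaving words like "NATURALLY" intact
wealth_of_nations %>%
  mutate(text = replace_na(text, ""))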
wealth_of_nations
## # A tibble: 34,546 x 2
## gutenberg_id text
## <int> <chr>
## 1 3300 "An Inquiry into the Nature and Causes of the Wealth of Nations"
## 2 3300 ""
## 3 3300 ""
## 4 3300 ""
## 5 3300 "by Adam Smith"
## 6 3300 ""
## 7 3300 ""
## 8 3300 ""
## 9 3300 ""
## 10 3300 ""
## # ... with 34,536 more rows
wealth_of_nations_clean <-
  wealth_of_nations %>%
  # The actual book doesn't start until line 187
  slice(187:n()) %>%
  # Get rid of rows where text is missing
  drop_na(text) %>%
  # The text is separated into multiple books, and chapters are numbered
  # continuously across them. cumsum() calculates the cumulative sum, so it
  # increases every time there's a new book or chapter and automatically
  # makes book and chapter numbers
  mutate(book = str_detect(text, "^BOOK"),
         book_number = cumsum(book)) %>%
  mutate(chapter = str_detect(text, "^CHAPTER"),
         chapter_number = cumsum(chapter)) %>%
  # Remove columns we don't need
  select(-book, -chapter, -gutenberg_id)
consequence_of_peace_clean <-
  consequence_of_peace %>%
  # The actual book doesn't start until line 63
  slice(63:n()) %>%
  # Get rid of rows where text is missing
  drop_na(text) %>%
  # This one isn't split into separate books, just chapters.
  # cumsum() calculates the cumulative sum, so it increases every time
  # a new chapter starts and automatically makes chapter numbers
  mutate(chapter = str_detect(text, "^CHAPTER"),
         chapter_number = cumsum(chapter)) %>%
  # Remove columns we don't need
  select(-chapter, -gutenberg_id)
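As a quick illustration of how the str_detect() + cumsum() trick numbers chapters (a toy example, not part of the book data):

# Toy sketch: each line that starts with "CHAPTER" flips str_detect() to TRUE,
# and cumsum() turns those flips into a running chapter counter
toy <- tibble(text = c("CHAPTER I.", "some text", "CHAPTER II.", "more text"))
toy %>%
  mutate(chapter_number = cumsum(str_detect(text, "^CHAPTER")))
# chapter_number comes out as 1, 1, 2, 2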
wealth_of_nations_clean
## # A tibble: 34,360 x 3
## text book_number chapter_number
## <chr> <int> <int>
## 1 "BOOK I." 1 0
## 2 "OF THE CAUSES OF IMPROVEMENT IN THE" 1 0
## 3 "PRODUCTIVE POWERS OF LABOUR, AND OF THE ORDER AC~ 1 0
## 4 "TURALLY DISTRIBUTED AMONG THE DIFFERENT RANKS OF~ 1 0
## 5 " " 1 0
## 6 "" 1 0
## 7 "" 1 0
## 8 "CHAPTER I." 1 1
## 9 "OF THE DIVISION OF LABOUR." 1 1
## 10 "" 1 1
## # ... with 34,350 more rows
word_frequencies_won <- wealth_of_nations_clean %>%
  # unnest_tokens() from tidytext splits the text so that each word
  # (or bigram, or paragraph) gets its own row
  unnest_tokens(word, text) %>%
  # Remove stop words
  anti_join(stop_words) %>%
  # Use str_extract() here because the UTF-8 texts from Project Gutenberg
  # wrap some words in underscores to mark emphasis (like italics); without
  # this, "_any_" would be counted separately from "any", which is not good
  # for counting words
  mutate(word = str_extract(word, "[a-z']+")) %>%
  # Count all the words
  count(word, sort = TRUE)
## Joining, by = "word"
# What does this look like?
word_frequencies_won
## # A tibble: 8,966 x 2
## word n
## <chr> <int>
## 1 <NA> 2786
## 2 price 1259
## 3 country 1235
## 4 labour 989
## 5 trade 970
## 6 produce 934
## 7 quantity 796
## 8 people 775
## 9 money 770
## 10 land 717
## # ... with 8,956 more rows
So now I have this pesky NA popping up: str_extract() returns NA whenever a token has no lowercase letters to match, which is mostly bare numbers in the text. I am going to remove those rows because I definitely don't want to count them.
# Remove rows with NA from the word frequencies
word_frequencies_won <- na.omit(word_frequencies_won)
word_frequencies_won
## # A tibble: 8,965 x 2
## word n
## <chr> <int>
## 1 price 1259
## 2 country 1235
## 3 labour 989
## 4 trade 970
## 5 produce 934
## 6 quantity 796
## 7 people 775
## 8 money 770
## 9 land 717
## 10 revenue 685
## # ... with 8,955 more rows
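An equivalent tidyverse-style way to drop those rows (a sketch, not what produced the output above) would be tidyr's drop_na():

# Sketch: drop rows where the word itself is missing
word_frequencies_won %>%
  drop_na(word)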
word_frequencies_won %>%
  # Keep top 15
  top_n(15) %>%
  # Make the words an ordered factor so they plot in order
  mutate(word = fct_inorder(word)) %>%
  ggplot(aes(x = n, y = word)) +
  geom_col()
## Selecting by n
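top_n() still works but has been superseded in more recent dplyr releases; if I rewrote this chunk today it would probably look something like the sketch below (slice_max() and fct_reorder() are assumed replacements I haven't actually run here, and fct_reorder() has the nice side effect of putting the most frequent word at the top of the y axis):

# Sketch: the newer dplyr/forcats idiom for the same plot
word_frequencies_won %>%
  slice_max(n, n = 15) %>%
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(x = n, y = word)) +
  geom_col()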
I'm going to do the same count for The Economic Consequences of the Peace as I did for The Wealth of Nations above.
word_frequencies_cop <- consequence_of_peace_clean %>%
  # unnest_tokens() from tidytext splits the text so that each word
  # (or bigram, or paragraph) gets its own row
  unnest_tokens(word, text) %>%
  # Remove stop words
  anti_join(stop_words) %>%
  # Use str_extract() here because the UTF-8 texts from Project Gutenberg
  # wrap some words in underscores to mark emphasis (like italics); without
  # this, "_any_" would be counted separately from "any", which is not good
  # for counting words
  mutate(word = str_extract(word, "[a-z']+")) %>%
  # Count all the words
  count(word, sort = TRUE)
## Joining, by = "word"
# Remove rows with NA, as before
word_frequencies_cop <- na.omit(word_frequencies_cop)
word_frequencies_cop
## # A tibble: 5,805 x 2
## word n
## <chr> <int>
## 1 germany 417
## 2 german 245
## 3 war 224
## 4 treaty 170
## 5 allies 156
## 6 europe 149
## 7 france 146
## 8 reparation 141
## 9 economic 132
## 10 commission 124
## # ... with 5,795 more rows
word_frequencies_cop %>%
  # Keep top 15
  top_n(15) %>%
  # Make the words an ordered factor so they plot in order
  mutate(word = fct_inorder(word)) %>%
  ggplot(aes(x = n, y = word)) +
  geom_col()
## Selecting by n
The Economic Consequences of the Peace is clearly written at a different time than The Wealth of Nations and focuses on the countries affected most by WWI. The Wealth of Nations doesn't have a war to talk about, but the two books clearly share a concern with the economy, trade, and money.
wealth_of_nations_bigrams <- wealth_of_nations_clean %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
wealth_of_nations_bigrams
## # A tibble: 351,769 x 3
## book_number chapter_number bigram
## <int> <int> <chr>
## 1 1 0 book i
## 2 1 0 of the
## 3 1 0 the causes
## 4 1 0 causes of
## 5 1 0 of improvement
## 6 1 0 improvement in
## 7 1 0 in the
## 8 1 0 productive powers
## 9 1 0 powers of
## 10 1 0 of labour
## # ... with 351,759 more rows
So my earlier code to remove the NAs apparently didn't work on the whole data frame. In hindsight that makes sense: str_remove_all() deletes the characters "NA" from the text but leaves true missing values alone, and tokenizing into bigrams produces new NAs for lines that are too short to form one. I am going to work through this step by step because I wasn't sure at first why it didn't solve the issue. For the word frequencies there were well over 2,000 NAs, and from how the data frame is set up NA will show up many times as a bigram too, which I don't want.
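As a quick sanity check on the first half of that (a toy example, unrelated to the book data):

# Sketch: str_remove_all() strips the characters "NA" from text,
# but a true missing value passes through untouched
str_remove_all(c("NATURALLY", NA), "NA")
# returns "TURALLY" and NA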
wealth_of_nations_bigrams <- wealth_of_nations_bigrams %>%
separate(bigram, c("w1", "w2"), sep = " ")
wealth_of_nations_bigrams
## # A tibble: 351,769 x 4
## book_number chapter_number w1 w2
## <int> <int> <chr> <chr>
## 1 1 0 book i
## 2 1 0 of the
## 3 1 0 the causes
## 4 1 0 causes of
## 5 1 0 of improvement
## 6 1 0 improvement in
## 7 1 0 in the
## 8 1 0 productive powers
## 9 1 0 powers of
## 10 1 0 of labour
## # ... with 351,759 more rows
data(stop_words)
wealth_of_nations_bigrams <- wealth_of_nations_bigrams %>%
filter(!w1 %in% stop_words$word) %>%
filter(!w2 %in% stop_words$word)
wealth_of_nations_bigrams
## # A tibble: 27,209 x 4
## book_number chapter_number w1 w2
## <int> <int> <chr> <chr>
## 1 1 0 productive powers
## 2 1 0 turally distributed
## 3 1 0 <NA> <NA>
## 4 1 0 <NA> <NA>
## 5 1 0 <NA> <NA>
## 6 1 1 <NA> <NA>
## 7 1 1 <NA> <NA>
## 8 1 1 <NA> <NA>
## 9 1 1 productive powers
## 10 1 1 skill dexterity
## # ... with 27,199 more rows
wealth_of_nations_bigrams <- wealth_of_nations_bigrams %>%
filter(!is.na(w1)) %>%
filter(!is.na(w2))
wealth_of_nations_bigrams
## # A tibble: 24,554 x 4
## book_number chapter_number w1 w2
## <int> <int> <chr> <chr>
## 1 1 0 productive powers
## 2 1 0 turally distributed
## 3 1 1 productive powers
## 4 1 1 skill dexterity
## 5 1 1 easily understood
## 6 1 1 carried furthest
## 7 1 1 trifling manufactures
## 8 1 1 single branch
## 9 1 1 trifling nature
## 10 1 1 trifling manufacture
## # ... with 24,544 more rows
wealth_of_nations_bigrams <- wealth_of_nations_bigrams %>%
unite(bigram, w1, w2, sep=" ")
wealth_of_nations_bigrams
## # A tibble: 24,554 x 3
## book_number chapter_number bigram
## <int> <int> <chr>
## 1 1 0 productive powers
## 2 1 0 turally distributed
## 3 1 1 productive powers
## 4 1 1 skill dexterity
## 5 1 1 easily understood
## 6 1 1 carried furthest
## 7 1 1 trifling manufactures
## 8 1 1 single branch
## 9 1 1 trifling nature
## 10 1 1 trifling manufacture
## # ... with 24,544 more rows
Just wondering now if “trifling” is in any of the dictionaries for when I do a sentiment analysis. I hope it is!
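A quick way to check would be something like this sketch (it assumes the Bing lexicon that tidytext's get_sentiments() can load; the AFINN and NRC lexicons would also need the textdata package and a one-time download):

# Sketch: is "trifling" in the Bing sentiment lexicon?
get_sentiments("bing") %>%
  filter(word == "trifling")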
bigram_frequencies_won <- wealth_of_nations_bigrams %>%
  # Count all the bigrams
  count(bigram, sort = TRUE)

bigram_frequencies_won %>%
  top_n(15) %>%
  mutate(bigram = fct_inorder(bigram)) %>%
  ggplot(aes(x = n, y = bigram)) +
  geom_col() +
  labs(x = "Count", y = NULL,
       title = "15 most frequent bigrams in Wealth of Nations")
## Selecting by n
consequence_of_peace_bigrams <- consequence_of_peace_clean %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("w1", "w2"), sep = " ") %>%
  filter(!w1 %in% stop_words$word) %>%
  filter(!w2 %in% stop_words$word) %>%
  filter(!is.na(w1)) %>%
  filter(!is.na(w2)) %>%
  unite(bigram, w1, w2, sep = " ")
consequence_of_peace_bigrams
## # A tibble: 7,149 x 2
## chapter_number bigram
## <int> <chr>
## 1 0 intensely unusual
## 2 0 unusual unstable
## 3 0 unstable complicated
## 4 0 complicated unreliable
## 5 0 unreliable temporary
## 6 0 temporary nature
## 7 0 economic organization
## 8 0 western europe
## 9 0 half century
## 10 0 late advantages
## # ... with 7,139 more rows
bigram_frequencies_cop <- consequence_of_peace_bigrams %>%
  # Count all the bigrams
  count(bigram, sort = TRUE)

bigram_frequencies_cop %>%
  top_n(15) %>%
  mutate(bigram = fct_inorder(bigram)) %>%
  ggplot(aes(x = n, y = bigram)) +
  geom_col() +
  labs(x = "Count", y = NULL,
       title = "15 most frequent bigrams in Economic Consequences of the Peace")
## Selecting by n
I'm thinking that with the bigrams and trigrams from The Wealth of Nations and The Economic Consequences of the Peace I can focus in on some particulars that both authors find important, judging by how often they use them. How connected are the two economists? Are the same subjects from Adam Smith's time still relevant in Keynes's time?
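One rough way to start answering that (a sketch I haven't run yet, using only the frequency tables built above) would be to join the two bigram counts and keep the bigrams both books use:

# Sketch: bigrams that appear in both books, with a count from each
bigram_frequencies_won %>%
  inner_join(bigram_frequencies_cop, by = "bigram",
             suffix = c("_won", "_cop")) %>%
  arrange(desc(n_won + n_cop))

For now, though, back to the counts for each book on its own.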
bigram_counts_won <- wealth_of_nations_bigrams %>% count(bigram, sort=TRUE)
bigram_counts_won
## # A tibble: 14,131 x 2
## bigram n
## <chr> <int>
## 1 annual produce 148
## 2 foreign trade 102
## 3 money price 89
## 4 home market 85
## 5 rude produce 75
## 6 0 0 65
## 7 productive labour 60
## 8 surplus produce 60
## 9 thousand pounds 60
## 10 east indies 56
## # ... with 14,121 more rows
# Recount with w1 and w2 in separate columns so graph_from_data_frame()
# can treat them as the two ends of an edge
bigram_counts_won <- wealth_of_nations_clean %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("w1", "w2"), sep = " ") %>%
  filter(!w1 %in% stop_words$word) %>%
  filter(!w2 %in% stop_words$word) %>%
  filter(!is.na(w1)) %>%
  filter(!is.na(w2)) %>%
  count(w1, w2, sort = TRUE)
bigram_counts_won
## # A tibble: 14,131 x 3
## w1 w2 n
## <chr> <chr> <int>
## 1 annual produce 148
## 2 foreign trade 102
## 3 money price 89
## 4 home market 85
## 5 rude produce 75
## 6 0 0 65
## 7 productive labour 60
## 8 surplus produce 60
## 9 thousand pounds 60
## 10 east indies 56
## # ... with 14,121 more rows
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
bigram_graph_won <- bigram_counts_won %>%
filter(n >= 56) %>%
graph_from_data_frame()
bigram_graph_won
## IGRAPH 7ce17ae DN-- 17 10 --
## + attr: name (v/c), n (e/n)
## + edges from 7ce17ae (vertex names):
## [1] annual ->produce foreign ->trade money ->price
## [4] home ->market rude ->produce 0 ->0
## [7] productive->labour surplus ->produce thousand ->pounds
## [10] east ->indies
ggraph(bigram_graph_won, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = grid::arrow(type = "closed", length = unit(2, "mm")),
                 end_cap = circle(1, "mm")) +
  geom_node_point(color = "lightblue", size = 2) +
  geom_node_text(aes(label = name), size = 2) +
  theme_void()
The above visual is very interesting: it depicts how the words in the most frequent bigrams connect to one another. I will build the same graph for the other book. For right now it seems as though the two books cover different subjects, and I want to do a little more digging before I run a sentiment analysis on anything. Let's see how it looks for The Economic Consequences of the Peace.
bigram_counts_cop <- consequence_of_peace_bigrams %>% count(bigram, sort = TRUE)

# As before, recount with w1 and w2 in separate columns for the graph
bigram_counts_cop <- consequence_of_peace_clean %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("w1", "w2"), sep = " ") %>%
  filter(!w1 %in% stop_words$word) %>%
  filter(!w2 %in% stop_words$word) %>%
  filter(!is.na(w1)) %>%
  filter(!is.na(w2)) %>%
  count(w1, w2, sort = TRUE)
bigram_graph_cop <- bigram_counts_cop %>%
filter(n >= 10) %>%
graph_from_data_frame()
bigram_graph_cop
## IGRAPH 7dfe4ab DN-- 37 20 --
## + attr: name (v/c), n (e/n)
## + edges from 7dfe4ab (vertex names):
## [1] reparation->commission pre ->war german ->government
## [4] upper ->silesia prime ->minister united ->kingdom
## [7] alsace ->lorraine austria ->hungary germany's ->capacity
## [10] economic ->life reparation->chapter raw ->materials
## [13] central ->europe german ->nationals allied ->countries
## [16] allied ->governments export ->trade foreign ->securities
## [19] inter ->ally iron ->ore
ggraph(bigram_graph_cop, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = grid::arrow(type = "closed", length = unit(2, "mm")),
                 end_cap = circle(1, "mm")) +
  geom_node_point(color = "lightblue", size = 2) +
  geom_node_text(aes(label = name), size = 2) +
  theme_void()
wealth_of_nations_trigrams <- wealth_of_nations_clean %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("w1", "w2", "w3"), sep = " ") %>%
  # Note: stop words are only filtered from the first two positions here,
  # so trigrams like "productive powers of" keep a stop word in the third slot
  filter(!w1 %in% stop_words$word) %>%
  filter(!w2 %in% stop_words$word) %>%
  filter(!is.na(w1)) %>%
  filter(!is.na(w2)) %>%
  filter(!is.na(w3)) %>%
  unite(trigram, w1, w2, w3, sep = " ")
wealth_of_nations_trigrams
## # A tibble: 22,125 x 3
## book_number chapter_number trigram
## <int> <int> <chr>
## 1 1 0 productive powers of
## 2 1 0 turally distributed among
## 3 1 1 productive powers of
## 4 1 1 skill dexterity and
## 5 1 1 easily understood by
## 6 1 1 carried furthest in
## 7 1 1 trifling manufactures which
## 8 1 1 single branch though
## 9 1 1 trifling nature the
## 10 1 1 trifling manufacture but
## # ... with 22,115 more rows
trigram_frequencies_won <- wealth_of_nations_trigrams %>%
  # Count all the trigrams
  count(trigram, sort = TRUE)

trigram_frequencies_won %>%
  top_n(15) %>%
  mutate(trigram = fct_inorder(trigram)) %>%
  ggplot(aes(x = n, y = trigram)) +
  geom_col() +
  labs(x = "Count", y = NULL,
       title = "15 most frequent trigrams in Wealth of Nations")
## Selecting by n
consequence_of_peace_trigrams <- consequence_of_peace_clean %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("w1", "w2", "w3"), sep = " ") %>%
  # Again, stop words are only filtered from the first two positions
  filter(!w1 %in% stop_words$word) %>%
  filter(!w2 %in% stop_words$word) %>%
  filter(!is.na(w1)) %>%
  filter(!is.na(w2)) %>%
  filter(!is.na(w3)) %>%
  unite(trigram, w1, w2, w3, sep = " ")
trigram_frequencies_cop <- consequence_of_peace_trigrams %>%
  # Count all the trigrams
  count(trigram, sort = TRUE)

trigram_frequencies_cop %>%
  top_n(15) %>%
  mutate(trigram = fct_inorder(trigram)) %>%
  ggplot(aes(x = n, y = trigram)) +
  geom_col() +
  labs(x = "Count", y = NULL,
       title = "15 most frequent trigrams in Consequences of the Peace")
## Selecting by n
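As a preview of the sentiment analysis I keep mentioning, the next step will probably look something like this sketch (chapter-level Bing scores; the column names come from the cleaned data above, the rest is an assumption until I actually run it):

# Sketch: net Bing sentiment per chapter of The Wealth of Nations
wealth_of_nations_clean %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(chapter_number, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(net_sentiment = positive - negative)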