library(tidytext)
library(textdata)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(ggplot2)
library(gutenbergr)
library(widyr)
library(tidygraph)
##
## Attaching package: 'tidygraph'
## The following object is masked from 'package:stats':
##
## filter
library(ggraph)
library(text2vec)
This research compares the two best-known books of F. Scott Fitzgerald and Mark Twain. For Fitzgerald these are ‘The Great Gatsby’ and ‘This Side of Paradise’; for Twain, ‘The Adventures of Tom Sawyer’ and ‘Adventures of Huckleberry Finn’. The report compares the two authors’ books by sentiment and writing style to answer three questions: which author writes with a more positive sentiment, what kinds of terms connect to each other throughout the books, and which words each author uses most in his stories. In the process, sentiment analysis is done using the Bing lexicon; tf-idf and the log odds ratio are then analyzed; finally, bigrams are examined. Together, these analyses make the authors’ writing styles clearer.
The data comes from the gutenbergr package, an R package that provides access to a vast collection of public domain books from Project Gutenberg (https://www.gutenberg.org/), a digital library offering free access to a wide range of books in many languages. The package lets users search, download, and analyze books from the Project Gutenberg repository directly within the R environment: it provides a convenient interface for accessing book metadata, retrieving the full text of books, and performing text analysis. The data is structured as ordinary book text. For this analysis, F. Scott Fitzgerald’s and Mark Twain’s books were downloaded with the package.
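If the Gutenberg IDs were not known in advance, they could be looked up from the package’s metadata. A minimal sketch, assuming the metadata spells the author as “Twain, Mark” (worth verifying against the returned titles before hard-coding any ID):
#Look up candidate Gutenberg IDs by author (the author string is an assumption).
gutenberg_works(author == "Twain, Mark") %>%
select(gutenberg_id, title)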
The data was cleaned and reshaped through tokenization. In R, the ‘tidytext’ package provides functions for text mining and text analysis, including unnest_tokens(), which splits text into individual words or tokens. Stop words can then be removed with anti_join() against the package’s stop_words table.
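A toy illustration of that pipeline on a made-up sentence (not taken from the books):
#Minimal sketch: tokenize one invented sentence and drop stop words.
toy <- tibble(text = "The widow she cried over me")
toy %>%
unnest_tokens(word, text) %>%   # one lowercase word per row
anti_join(stop_words, by = "word")
#Result: a tibble with only "widow" and "cried"; the rest were stop words.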
#Load the data for all 4 books.
toms <- gutenberg_download(gutenberg_id = 74)
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
huckle <- gutenberg_download(gutenberg_id = 76)
greatg <- gutenberg_download(64317)
sideofp <- gutenberg_download(805)
Also, linenumber and document columns were added; linenumber is used later for the sentiment analysis and document for tf-idf. Because linenumber comes from row_number() after tokenization, it indexes each word in reading order (rather than each printed line), and document labels each book by its title.
#Tokenize, remove stop words, and add linenumber (for sentiment analysis) and document (for tf-idf).
toms_tidy <- toms %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
mutate(linenumber = row_number()) %>%
mutate(document = "The Adventures of Tom Sawyer")
## Joining with `by = join_by(word)`
toms_tidy
## # A tibble: 26,509 × 4
## gutenberg_id word linenumber document
## <int> <chr> <int> <chr>
## 1 74 adventures 1 The Adventures of Tom Sawyer
## 2 74 tom 2 The Adventures of Tom Sawyer
## 3 74 sawyer 3 The Adventures of Tom Sawyer
## 4 74 mark 4 The Adventures of Tom Sawyer
## 5 74 twain 5 The Adventures of Tom Sawyer
## 6 74 samuel 6 The Adventures of Tom Sawyer
## 7 74 langhorne 7 The Adventures of Tom Sawyer
## 8 74 clemens 8 The Adventures of Tom Sawyer
## 9 74 contents 9 The Adventures of Tom Sawyer
## 10 74 chapter 10 The Adventures of Tom Sawyer
## # ℹ 26,499 more rows
huckle_tidy <- huckle %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
mutate(linenumber = row_number()) %>%
mutate(document = "Adventures of Huckleberry Finn")
## Joining with `by = join_by(word)`
greatg_tidy <- greatg %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
mutate(linenumber = row_number()) %>%
mutate(document = "Great Gatsby")
## Joining with `by = join_by(word)`
sideofp_tidy <- sideofp %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
mutate(linenumber = row_number()) %>%
mutate(document = "This Side of Paradise")
## Joining with `by = join_by(word)`
Next, Bing sentiments were added for the sentiment analysis. The inner_join() function was used to attach the Bing lexicon to every book.
#Apply bing sentiments to data.
bing_sentiments <- get_sentiments("bing")
bing_toms <- toms_tidy %>%
inner_join(bing_sentiments, by = "word")
bing_huckle <- huckle_tidy %>%
inner_join(bing_sentiments, by = "word")
bing_greatg <- greatg_tidy %>%
inner_join(bing_sentiments, by = "word")
bing_sideofp <- sideofp_tidy %>%
inner_join(bing_sentiments, by = "word")
Data pre-processing was also necessary for the bigrams. After tokenizing each book into bigrams, the separate() function divides every bigram into word1 and word2; this is necessary in order to remove stop words from either position. After the stop words are removed, the bigrams are reunited with unite(). This process was repeated for every book.
toms_bigram <- toms %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
filter(!is.na(bigram))
toms_separated <- toms_bigram %>%
separate(bigram, c("word1", "word2"), sep = " ")
toms_filtered <- toms_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
toms_united <- toms_filtered %>%
unite(bigram, word1, word2, sep = " ")
huckle_bigram <- huckle %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
filter(!is.na(bigram))
huckle_separated <- huckle_bigram %>%
separate(bigram, c("word1", "word2"), sep = " ")
huckle_filtered <- huckle_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
huckle_united <- huckle_filtered %>%
unite(bigram, word1, word2, sep = " ")
greatg_bigram <- greatg%>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
filter(!is.na(bigram))
greatg_separated <- greatg_bigram %>%
separate(bigram, c("word1", "word2"), sep = " ")
greatg_filtered <- greatg_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
greatg_united <- greatg_filtered %>%
unite(bigram, word1, word2, sep = " ")
sideofp_bigram <- sideofp %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
filter(!is.na(bigram))
sideofp_separated <- sideofp_bigram %>%
separate(bigram, c("word1", "word2"), sep = " ")
sideofp_filtered <- sideofp_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
sideofp_united <- sideofp_filtered %>%
unite(bigram, word1, word2, sep = " ")
First, the top 10 most frequent positive and negative words were found using the Bing lexicon. This shows which positive and negative words each book uses most often, which lets the authors’ writing styles be seen and compared. Mark Twain’s two books share several negative words, such as ‘dead’ and ‘awful’, and the same happens with the positive words: Twain favors words like ‘mighty’ and ‘pretty’. Fitzgerald’s negative words resemble Twain’s, but there is a clear difference in the positive words. Even though the authors appear to share some common words, the most frequent words alone do not reveal everything about a writer’s style.
#Find top frequent words in each dataset.
freq_toms <- bing_toms %>%
count(word, sentiment) %>%
group_by(sentiment) %>%
slice_max(n, n = 10)
freq_toms
## # A tibble: 20 × 3
## # Groups: sentiment [2]
## word sentiment n
## <chr> <chr> <int>
## 1 dead negative 68
## 2 poor negative 56
## 3 cave negative 47
## 4 awful negative 41
## 5 fell negative 41
## 6 lost negative 35
## 7 trouble negative 32
## 8 dark negative 29
## 9 knife negative 27
## 10 hard negative 24
## 11 mighty positive 26
## 12 treasure positive 26
## 13 master positive 22
## 14 glad positive 21
## 15 ready positive 19
## 16 fine positive 18
## 17 pretty positive 18
## 18 happy positive 17
## 19 strong positive 14
## 20 nice positive 13
freq_huckle <- bing_huckle %>%
count(word, sentiment) %>%
group_by(sentiment) %>%
slice_max(n, n = 10)
freq_huckle
## # A tibble: 22 × 3
## # Groups: sentiment [2]
## word sentiment n
## <chr> <chr> <int>
## 1 trouble negative 84
## 2 dead negative 76
## 3 miss negative 76
## 4 dark negative 70
## 5 poor negative 51
## 6 struck negative 50
## 7 bad negative 41
## 8 blame negative 39
## 9 fool negative 38
## 10 awful negative 35
## # ℹ 12 more rows
freq_greatg <- bing_greatg %>%
count(word, sentiment) %>%
group_by(sentiment) %>%
slice_max(n, n = 10)
freq_greatg
## # A tibble: 21 × 3
## # Groups: sentiment [2]
## word sentiment n
## <chr> <chr> <int>
## 1 miss negative 38
## 2 dark negative 25
## 3 broke negative 23
## 4 hard negative 19
## 5 slowly negative 18
## 6 fell negative 17
## 7 cold negative 14
## 8 crazy negative 14
## 9 pale negative 14
## 10 dead negative 12
## # ℹ 11 more rows
freq_sideofp <- bing_sideofp %>%
count(word, sentiment) %>%
group_by(sentiment) %>%
slice_max(n, n = 10)
freq_sideofp
## # A tibble: 20 × 3
## # Groups: sentiment [2]
## word sentiment n
## <chr> <chr> <int>
## 1 dark negative 48
## 2 poor negative 35
## 3 afraid negative 34
## 4 damn negative 28
## 5 dead negative 26
## 6 hard negative 26
## 7 lost negative 26
## 8 slowly negative 26
## 9 tired negative 25
## 10 fell negative 23
## 11 love positive 91
## 12 beauty positive 31
## 13 beautiful positive 26
## 14 golden positive 24
## 15 pretty positive 22
## 16 romantic positive 21
## 17 strong positive 20
## 18 wonderful positive 20
## 19 happy positive 19
## 20 brilliant positive 18
Next, the positive and negative word counts were analyzed in chunks of 100. Counting sentiments with index = linenumber %/% 100 and spreading the counts with pivot_wider() gives the sentiment of the story chunk by chunk (since linenumber indexes words, each chunk covers 100 words).
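The %/% operator is integer division, so it simply maps each word’s running index to a chunk number:
#Words 0-99 fall in chunk 0, 100-199 in chunk 1, and so on:
c(5, 205, 1999) %/% 100
## [1]  0  2 19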
#Find the sentiment of the story by 100 lines.
definedtoms <- toms_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(index = linenumber %/% 100, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)
## Joining with `by = join_by(word)`
definedhuckle <- huckle_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(index = linenumber %/% 100, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)
## Joining with `by = join_by(word)`
definedgreatg <- greatg_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(index = linenumber %/% 100, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)
## Joining with `by = join_by(word)`
definedsideofp <- sideofp_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(index = linenumber %/% 100, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0)
## Joining with `by = join_by(word)`
For the visualization of each book’s plot sentiment, it was important to quantify the sentiment: subtracting the negative count from the positive count in each chunk yields the numeric score that a graph requires.
#Visualization of the story's plot sentiment.
toms_sentiment <- toms_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(index = linenumber %/% 100, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
huckle_sentiment <- huckle_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(index = linenumber %/% 100, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
greatg_sentiment <- greatg_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(index = linenumber %/% 100, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
sideofp_sentiment <- sideofp_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(index = linenumber %/% 100, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
A bigram is a sequence of two adjacent words or tokens. Bigrams capture patterns and relationships between neighboring words in a text.
pair_tomsbi <- toms_united %>%
count(bigram, sort = TRUE) %>%
filter(n >= 5) %>%
na.omit()
pair_tomsbi
## # A tibble: 44 × 2
## bigram n
## <chr> <int>
## 1 injun joe 47
## 2 aunt polly 46
## 3 tom sawyer 25
## 4 muff potter 20
## 5 injun joe’s 18
## 6 sunday school 14
## 7 widow douglas 14
## 8 joe harper 13
## 9 becky thatcher 12
## 10 aunt polly’s 10
## # ℹ 34 more rows
pair_hucklebi <- huckle_united %>%
count(bigram, sort = TRUE) %>%
filter(n >= 7) %>%
na.omit()
pair_hucklebi
## # A tibble: 40 × 2
## bigram n
## <chr> <int>
## 1 mary jane 42
## 2 tom sawyer 40
## 3 aunt sally 38
## 4 miss watson 21
## 5 tow head 20
## 6 miss mary 19
## 7 uncle silas 18
## 8 runaway nigger 17
## 9 mars tom 15
## 10 bad luck 14
## # ℹ 30 more rows
pair_greatgbi <- greatg_united %>%
count(bigram, sort = TRUE) %>%
filter(n >= 4) %>%
na.omit()
pair_greatgbi
## # A tibble: 24 × 2
## bigram n
## <chr> <int>
## 1 miss baker 23
## 2 west egg 23
## 3 tom buchanan 14
## 4 jordan baker 12
## 5 gatsby’s house 10
## 6 demanded tom 8
## 7 front door 8
## 8 jay gatsby 8
## 9 shook hands 8
## 10 cried daisy 6
## # ℹ 14 more rows
pair_sidebi <- sideofp_united %>%
count(bigram, sort = TRUE) %>%
filter(n >= 5) %>%
na.omit()
pair_sidebi
## # A tibble: 43 × 2
## bigram n
## <chr> <int>
## 1 st regis 16
## 2 amory blaine 15
## 3 monsignor darcy 12
## 4 amory considered 11
## 5 lake geneva 10
## 6 amory looked 9
## 7 burne holiday 9
## 8 dear boy 9
## 9 note book 9
## 10 amory found 8
## # ℹ 33 more rows
Creating the network graph data is also essential: before a visual network graph can be drawn, the bigram pairs must be converted into a graph object, here with node centrality and community (group) membership computed for later styling.
#Creating Network Graph Data
graph_bigramtoms <- pair_tomsbi %>%
as_tbl_graph(directed = FALSE) %>%
mutate(centrality = centrality_degree(),
group = as.factor(group_infomap()))
graph_bigramtoms
## # A tbl_graph: 58 nodes and 44 edges
## #
## # An unrooted forest with 14 trees
## #
## # A tibble: 58 × 3
## name centrality group
## <chr> <dbl> <fct>
## 1 injun joe 1 7
## 2 aunt polly 1 8
## 3 tom sawyer 1 9
## 4 muff potter 1 10
## 5 injun joe’s 1 11
## 6 sunday school 1 4
## # ℹ 52 more rows
## #
## # A tibble: 44 × 2
## from to
## <int> <int>
## 1 1 45
## 2 2 46
## 3 3 47
## # ℹ 41 more rows
graph_bigramhuckle <- pair_hucklebi %>%
as_tbl_graph(directed = FALSE) %>%
mutate(centrality = centrality_degree(),
group = as.factor(group_infomap()))
graph_bigramhuckle
## # A tbl_graph: 57 nodes and 40 edges
## #
## # An unrooted forest with 17 trees
## #
## # A tibble: 57 × 3
## name centrality group
## <chr> <dbl> <fct>
## 1 mary jane 1 8
## 2 tom sawyer 1 9
## 3 aunt sally 1 10
## 4 miss watson 1 11
## 5 tow head 1 12
## 6 miss mary 1 13
## # ℹ 51 more rows
## #
## # A tibble: 40 × 2
## from to
## <int> <int>
## 1 1 41
## 2 2 42
## 3 3 43
## # ℹ 37 more rows
graph_bigramgreatg <- pair_greatgbi %>%
as_tbl_graph(directed = FALSE) %>%
mutate(centrality = centrality_degree(),
group = as.factor(group_infomap()))
graph_bigramgreatg
## # A tbl_graph: 32 nodes and 24 edges
## #
## # An unrooted forest with 8 trees
## #
## # A tibble: 32 × 3
## name centrality group
## <chr> <dbl> <fct>
## 1 miss baker 1 4
## 2 west egg 1 4
## 3 tom buchanan 1 6
## 4 jordan baker 1 7
## 5 gatsby’s house 1 8
## 6 demanded tom 1 2
## # ℹ 26 more rows
## #
## # A tibble: 24 × 2
## from to
## <int> <int>
## 1 1 25
## 2 2 25
## 3 3 26
## # ℹ 21 more rows
graph_bigramside <- pair_sidebi %>%
as_tbl_graph(directed = FALSE) %>%
mutate(centrality = centrality_degree(),
group = as.factor(group_infomap()))
graph_bigramside
## # A tbl_graph: 53 nodes and 43 edges
## #
## # An unrooted forest with 10 trees
## #
## # A tibble: 53 × 3
## name centrality group
## <chr> <dbl> <fct>
## 1 st regis 1 6
## 2 amory blaine 1 7
## 3 monsignor darcy 1 8
## 4 amory considered 1 9
## 5 lake geneva 1 10
## 6 amory looked 1 4
## # ℹ 47 more rows
## #
## # A tibble: 43 × 2
## from to
## <int> <int>
## 1 1 44
## 2 2 45
## 3 3 46
## # ℹ 40 more rows
TF-IDF stands for Term Frequency-Inverse Document Frequency, a numerical statistic used to evaluate the importance of a term to a document within a collection or corpus, in this case the four books. The higher a term’s tf-idf in a document, the more important or relevant it is to that document compared to the rest of the corpus. This helps identify significant terms or keywords that represent a document’s content or distinguish it from the other documents in the collection.
toms_tidy <- toms_tidy %>% mutate(document = "The Adventures of Tom Sawyer")
huckle_tidy <- huckle_tidy %>% mutate(document = "Adventures of Huckleberry Finn")
greatg_tidy <- greatg_tidy %>% mutate(document = "Great Gatsby")
sideofp_tidy <- sideofp_tidy %>% mutate(document = "This Side of Paradise")
toms_tidy
## # A tibble: 26,509 × 4
## gutenberg_id word linenumber document
## <int> <chr> <int> <chr>
## 1 74 adventures 1 The Adventures of Tom Sawyer
## 2 74 tom 2 The Adventures of Tom Sawyer
## 3 74 sawyer 3 The Adventures of Tom Sawyer
## 4 74 mark 4 The Adventures of Tom Sawyer
## 5 74 twain 5 The Adventures of Tom Sawyer
## 6 74 samuel 6 The Adventures of Tom Sawyer
## 7 74 langhorne 7 The Adventures of Tom Sawyer
## 8 74 clemens 8 The Adventures of Tom Sawyer
## 9 74 contents 9 The Adventures of Tom Sawyer
## 10 74 chapter 10 The Adventures of Tom Sawyer
## # ℹ 26,499 more rows
toptomstidy <- toms_tidy %>%
count(document, word, sort = TRUE)
tophuckletidy <- huckle_tidy %>%
count(document, word, sort = TRUE)
topgreatgtidy <- greatg_tidy %>%
count(document, word, sort = TRUE)
topsidetidy <- sideofp_tidy %>%
count(document, word, sort = TRUE)
data <- bind_rows(toptomstidy, tophuckletidy, topgreatgtidy, topsidetidy)
data
## # A tibble: 27,882 × 3
## document word n
## <chr> <chr> <int>
## 1 The Adventures of Tom Sawyer tom 722
## 2 The Adventures of Tom Sawyer huck 232
## 3 The Adventures of Tom Sawyer don’t 218
## 4 The Adventures of Tom Sawyer time 191
## 5 The Adventures of Tom Sawyer it’s 160
## 6 The Adventures of Tom Sawyer boys 158
## 7 The Adventures of Tom Sawyer joe 138
## 8 The Adventures of Tom Sawyer boy 122
## 9 The Adventures of Tom Sawyer ain’t 120
## 10 The Adventures of Tom Sawyer that’s 113
## # ℹ 27,872 more rows
tfidf <- data %>%
bind_tf_idf(word, document, n) %>%
arrange(-tf_idf)
tfidf
## # A tibble: 27,882 × 6
## document word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 This Side of Paradise amory 838 0.0272 1.39 0.0377
## 2 Great Gatsby gatsby 197 0.0111 1.39 0.0154
## 3 Great Gatsby daisy 150 0.00848 1.39 0.0117
## 4 Adventures of Huckleberry Finn en 235 0.00650 1.39 0.00901
## 5 This Side of Paradise rosalind 189 0.00613 1.39 0.00850
## 6 The Adventures of Tom Sawyer huck 232 0.00875 0.693 0.00607
## 7 Adventures of Huckleberry Finn warn’t 290 0.00802 0.693 0.00556
## 8 The Adventures of Tom Sawyer becky 102 0.00385 1.39 0.00533
## 9 Great Gatsby gatsby’s 67 0.00379 1.39 0.00525
## 10 Great Gatsby jordan 64 0.00362 1.39 0.00501
## # ℹ 27,872 more rows
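As a quick sanity check on the table above, idf follows directly from its definition, idf = log(number of documents / number of documents containing the term), and tf_idf = tf * idf:
#"amory" appears in 1 of the 4 books; "huck" appears in 2 of the 4.
log(4 / 1)           # idf for "amory": ~1.39, as in the table
log(4 / 2)           # idf for "huck": ~0.693
0.0272 * log(4 / 1)  # tf * idf for "amory": ~0.0377, matching tf_idf above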
top_tfidf <- tfidf %>%
group_by(document) %>%
slice_max(tf_idf, n = 10, with_ties = FALSE)
top_tfidf
## # A tibble: 40 × 6
## # Groups: document [4]
## document word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Adventures of Huckleberry Finn en 235 0.00650 1.39 0.00901
## 2 Adventures of Huckleberry Finn warn’t 290 0.00802 0.693 0.00556
## 3 Adventures of Huckleberry Finn nigger 157 0.00434 0.693 0.00301
## 4 Adventures of Huckleberry Finn jim 349 0.00965 0.288 0.00278
## 5 Adventures of Huckleberry Finn duke 144 0.00398 0.693 0.00276
## 6 Adventures of Huckleberry Finn didn’t 344 0.00951 0.288 0.00274
## 7 Adventures of Huckleberry Finn dat 70 0.00194 1.39 0.00268
## 8 Adventures of Huckleberry Finn don’t 335 0.00926 0.288 0.00266
## 9 Adventures of Huckleberry Finn canoe 64 0.00177 1.39 0.00245
## 10 Adventures of Huckleberry Finn ain’t 290 0.00802 0.288 0.00231
## # ℹ 30 more rows
filtered_data <- tfidf %>%
filter(document %in% c("The Adventures of Tom Sawyer", "Adventures of Huckleberry Finn", "Great Gatsby", "This Side of Paradise"))
author_data <- filtered_data %>%
mutate(author = case_when(
document %in% c("The Adventures of Tom Sawyer", "Adventures of Huckleberry Finn") ~ "Mark Twain",
document %in% c("Great Gatsby", "This Side of Paradise") ~ "F. Scott Fitzgerald"
))
frequency_wide <- author_data %>%
pivot_wider(names_from = author,
values_from = n,
values_fill = list(n = 0)) %>%
rename(Twain = `Mark Twain`, Fitzgerald = `F. Scott Fitzgerald`)
frequency_wide <- frequency_wide %>%
mutate(log_odds_ratio = log(((Twain + 1) / (sum(Twain + 1))) /
((Fitzgerald + 1) / (sum(Fitzgerald + 1)))))
frequency_wide
## # A tibble: 27,882 × 8
## document word tf idf tf_idf Fitzgerald Twain log_odds_ratio
## <chr> <chr> <dbl> <dbl> <dbl> <int> <int> <dbl>
## 1 This Side of Par… amory 0.0272 1.39 0.0377 838 0 -6.90
## 2 Great Gatsby gats… 0.0111 1.39 0.0154 197 0 -5.46
## 3 Great Gatsby daisy 0.00848 1.39 0.0117 150 0 -5.19
## 4 Adventures of Hu… en 0.00650 1.39 0.00901 0 235 5.29
## 5 This Side of Par… rosa… 0.00613 1.39 0.00850 189 0 -5.42
## 6 The Adventures o… huck 0.00875 0.693 0.00607 0 232 5.28
## 7 Adventures of Hu… warn… 0.00802 0.693 0.00556 0 290 5.50
## 8 The Adventures o… becky 0.00385 1.39 0.00533 0 102 4.47
## 9 Great Gatsby gats… 0.00379 1.39 0.00525 67 0 -4.39
## 10 Great Gatsby jord… 0.00362 1.39 0.00501 64 0 -4.34
## # ℹ 27,872 more rows
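To make the formula concrete, here is a toy check with invented counts (not taken from the books; in the code above the denominators are sums of the smoothed counts over the whole vocabulary):
#Hypothetical word: 10 uses out of ~1,000 Twain tokens and 2 uses out of
#~1,200 Fitzgerald tokens, each smoothed by +1 (illustrative numbers only).
log(((10 + 1) / (1000 + 1)) / ((2 + 1) / (1200 + 1)))
#about 1.48: positive, so the word leans toward Twain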
top10lor <- frequency_wide %>%
group_by(author = ifelse(log_odds_ratio > 0, "Twain", "Fitzgerald")) %>%
slice_max(abs(log_odds_ratio), n = 10, with_ties = FALSE)
top10lor
## # A tibble: 20 × 9
## # Groups: author [2]
## document word tf idf tf_idf Fitzgerald Twain log_odds_ratio author
## <chr> <chr> <dbl> <dbl> <dbl> <int> <int> <dbl> <chr>
## 1 This Side… amory 0.0272 1.39 0.0377 838 0 -6.90 Fitzg…
## 2 Great Gat… gats… 0.0111 1.39 0.0154 197 0 -5.46 Fitzg…
## 3 This Side… rosa… 0.00613 1.39 0.00850 189 0 -5.42 Fitzg…
## 4 Great Gat… tom 0.0100 0 0 177 0 -5.35 Fitzg…
## 5 Great Gat… daisy 0.00848 1.39 0.0117 150 0 -5.19 Fitzg…
## 6 This Side… night 0.00467 0 0 144 0 -5.15 Fitzg…
## 7 This Side… peop… 0.00412 0 0 127 0 -5.02 Fitzg…
## 8 This Side… life 0.00373 0 0 115 0 -4.92 Fitzg…
## 9 This Side… eyes 0.00363 0 0 112 0 -4.90 Fitzg…
## 10 This Side… day 0.00331 0 0 102 0 -4.80 Fitzg…
## 11 The Adven… tom 0.0272 0 0 0 722 6.41 Twain
## 12 Adventure… jim 0.00965 0.288 0.00278 0 349 5.69 Twain
## 13 Adventure… didn… 0.00951 0.288 0.00274 0 344 5.67 Twain
## 14 Adventure… don’t 0.00926 0.288 0.00266 0 335 5.65 Twain
## 15 Adventure… time 0.00899 0 0 0 325 5.62 Twain
## 16 Adventure… warn… 0.00802 0.693 0.00556 0 290 5.50 Twain
## 17 Adventure… ain’t 0.00802 0.288 0.00231 0 290 5.50 Twain
## 18 Adventure… de 0.00694 0 0 0 251 5.36 Twain
## 19 Adventure… en 0.00650 1.39 0.00901 0 235 5.29 Twain
## 20 The Adven… huck 0.00875 0.693 0.00607 0 232 5.28 Twain
As mentioned before, the sentiment scores had to be quantified in order to visualize them as graphs. As the figures below show, all four books lean negative overall; this is partly an effect of the Bing lexicon, which contains more negative words than positive ones. Both authors vary in sentiment across their books: each has one book that ends on a positive note and one that ends on a negative one. Throughout Tom Sawyer there is also less positive sentiment than in Huckleberry Finn, and the same pattern holds for Fitzgerald: The Great Gatsby has more negative sentiment indexes than This Side of Paradise.
#Figures 1-1~1-4
ggplot(toms_sentiment, aes(index, sentiment, fill = "The Adventures of Tom Sawyer")) +
geom_col(show.legend = FALSE) +
ggtitle("The Adventures of Tom Sawyer Sentiment Flow") +
labs(subtitle = "Figure 1-1")
ggplot(huckle_sentiment, aes(index, sentiment, fill = "Adventures of Huckleberry Finn")) +
geom_col(show.legend = FALSE) +
ggtitle("Adventures of Huckleberry Finn Sentiment Flow") +
labs(subtitle = "Figure 1-2")
ggplot(greatg_sentiment, aes(index, sentiment, fill = "The Great Gatsby")) +
geom_col(show.legend = FALSE) +
ggtitle("The Great Gatsby Sentiment Flow") +
labs(subtitle = "Figure 1-3")
ggplot(sideofp_sentiment, aes(index, sentiment, fill = "This Side of Paradise")) +
geom_col(show.legend = FALSE) +
ggtitle("This Side of Paradise Sentiment Flow") +
labs(subtitle = "Figure 1-4")
It can be seen in the figures below that most of the top repeated bigrams are names: names of people, the name of a lake, and so on. This shows how both authors emphasize their characters, and many words connect to each book’s main characters. Network graphs are the best way to present bigram results: they show clearly which words connect to which, the nodes are separated by color so there is no confusion, and node size scales with frequency, revealing which words appear more often in the text and which have more connections to others.
set.seed(1234)
ggraph(graph_bigramtoms, layout = "fr") +
geom_edge_link(color = "gray50", alpha = 0.5) +
geom_node_point(aes(size = centrality, color = group), show.legend = FALSE) +
scale_size(range = c(5, 10)) +
geom_node_text(aes(label = name), repel = TRUE, size = 3.5) +
theme_graph(base_family = "Arial") +
labs(title = "Bigrams in The Adventures of Tom Sawyer", subtitle = "Figure 2-1" )
ggraph(graph_bigramhuckle, layout = "fr") +
geom_edge_link(color = "gray50", alpha = 0.5) +
geom_node_point(aes(size = centrality, color = group), show.legend = FALSE) +
scale_size(range = c(5, 10)) +
geom_node_text(aes(label = name), repel = TRUE, size = 3.5) +
theme_graph(base_family = "Arial") +
labs(title = "Bigrams in Adventures of Huckleberry Finn", subtitle = "Figure 2-2" )
ggraph(graph_bigramgreatg, layout = "fr") +
geom_edge_link(color = "gray50", alpha = 0.5) +
geom_node_point(aes(size = centrality, color = group), show.legend = FALSE) +
scale_size(range = c(5, 10)) +
geom_node_text(aes(label = name), repel = TRUE, size = 3.5) +
theme_graph(base_family = "Arial") +
labs(title = "Bigrams in Great Gatsby", subtitle = "Figure 2-3" )
ggraph(graph_bigramside, layout = "fr") +
geom_edge_link(color = "gray50", alpha = 0.5) +
geom_node_point(aes(size = centrality, color = group), show.legend = FALSE) +
scale_size(range = c(5, 10)) +
geom_node_text(aes(label = name), repel = TRUE, size = 3.5) +
theme_graph(base_family = "Arial") +
labs(title = "Bigrams in This Side of Paradise", subtitle = "Figure 2-4" )
Showing the tf-idf results as bar graphs makes them clearer to read. The bars are colored by book and labeled, so it is easy to see which words had the highest tf-idf in each book.
ggplot(top_tfidf, aes(x = reorder_within(word, tf_idf, document),
y = tf_idf,
fill = document)) +
geom_col(show.legend = F) +
coord_flip() +
facet_wrap(~ document, scales = "free", ncol = 2) +
scale_x_reordered() +
labs(x = NULL, title = "TF_IDF in each Book", subtitle = "Figure 3-1") +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))
### Log Odds Ratio (Figure 3-2)
The log odds ratios also surface many character names: they appear three times among Twain’s top words and four times among Fitzgerald’s. Aside from the names, the words Twain favors are not common ones for a book, because they render the southern Missouri dialect that Huckleberry and Tom Sawyer speak; examples are ain’t, en, and de. When comparing the log odds ratios of two authors, a bar graph is the simplest and clearest display: the bars are colored by author and labeled, so there is no confusion.
ggplot(top10lor, aes(x = reorder_within(word, log_odds_ratio, author),
y = log_odds_ratio,
fill = author)) +
geom_col(show.legend = F) +
coord_flip() +
facet_wrap(~author, scales = "free", ncol = 2) +
scale_x_reordered() +
labs(x = NULL, y = "Log Odds Ratio", title = "Log Odds Ratio for Authors", subtitle = "Figure 3-2")
In conclusion, both authors emphasize their characters through high name frequency. There are many reasons for this, and it is natural to repeat a main character’s name often, but it can also be seen as a skill both authors use. There were also differences between the authors. Mark Twain used many words unfamiliar to most readers, since both of his books take place in the same region, Missouri, and reproduce its dialect; this can be seen as a skill Twain used to make his writing feel fresh and attract more readers. By contrast, Fitzgerald’s books contain mostly familiar words, which is entirely ordinary. The sentiment analysis showed that the two writers have both differences and similarities in sentiment across their plots: writers use different sentiments in different books, and not all books have happy endings.