OBJECTIVE: Perform a simple sentiment analysis on a collection of
text. I found a list of book titles to run the analysis on, and I am
curious to see whether I can associate a sentiment score with the titles
of a specific author.
Load packages and create unnested data set
# Load packages
library(gutenbergr)
library(dplyr)
library(tidytext)
# Grab the Project Gutenberg catalogue table provided by gutenbergr.
gute <- gutenberg_metadata
# Break each title into individual word tokens; the `title` column is
# replaced by a `word` column holding one token per row.
gute_unnest <- unnest_tokens(gute, word, title)
gute_unnest |> head(n = 20)
## # A tibble: 20 × 8
## gutenberg_id author gutenberg_author_id language gutenberg_bookshelf rights
## <int> <chr> <int> <chr> <chr> <chr>
## 1 1 Jeffers… 1638 en Politics/American … Publi…
## 2 1 Jeffers… 1638 en Politics/American … Publi…
## 3 1 Jeffers… 1638 en Politics/American … Publi…
## 4 1 Jeffers… 1638 en Politics/American … Publi…
## 5 1 Jeffers… 1638 en Politics/American … Publi…
## 6 1 Jeffers… 1638 en Politics/American … Publi…
## 7 1 Jeffers… 1638 en Politics/American … Publi…
## 8 1 Jeffers… 1638 en Politics/American … Publi…
## 9 1 Jeffers… 1638 en Politics/American … Publi…
## 10 1 Jeffers… 1638 en Politics/American … Publi…
## 11 2 United … 1 en Politics/American … Publi…
## 12 2 United … 1 en Politics/American … Publi…
## 13 2 United … 1 en Politics/American … Publi…
## 14 2 United … 1 en Politics/American … Publi…
## 15 2 United … 1 en Politics/American … Publi…
## 16 2 United … 1 en Politics/American … Publi…
## 17 2 United … 1 en Politics/American … Publi…
## 18 2 United … 1 en Politics/American … Publi…
## 19 2 United … 1 en Politics/American … Publi…
## 20 2 United … 1 en Politics/American … Publi…
## # ℹ 2 more variables: has_text <lgl>, word <chr>
Remove stop words and join with ‘bing’ sentiment lexicon
# Remove stop words: keep only the title tokens that have no match in
# tidytext's `stop_words` table. The join key is named explicitly so the
# join does not silently depend on which column names the tables share.
gute_clean <- gute_unnest |>
  anti_join(stop_words, by = "word")
head(gute_clean, 20)
## # A tibble: 20 × 8
## gutenberg_id author gutenberg_author_id language gutenberg_bookshelf rights
## <int> <chr> <int> <chr> <chr> <chr>
## 1 1 Jeffers… 1638 en "Politics/American… Publi…
## 2 1 Jeffers… 1638 en "Politics/American… Publi…
## 3 1 Jeffers… 1638 en "Politics/American… Publi…
## 4 1 Jeffers… 1638 en "Politics/American… Publi…
## 5 2 United … 1 en "Politics/American… Publi…
## 6 2 United … 1 en "Politics/American… Publi…
## 7 2 United … 1 en "Politics/American… Publi…
## 8 2 United … 1 en "Politics/American… Publi…
## 9 2 United … 1 en "Politics/American… Publi…
## 10 2 United … 1 en "Politics/American… Publi…
## 11 2 United … 1 en "Politics/American… Publi…
## 12 2 United … 1 en "Politics/American… Publi…
## 13 3 Kennedy… 1666 en "" Publi…
## 14 3 Kennedy… 1666 en "" Publi…
## 15 3 Kennedy… 1666 en "" Publi…
## 16 3 Kennedy… 1666 en "" Publi…
## 17 4 Lincoln… 3 en "US Civil War" Publi…
## 18 4 Lincoln… 3 en "US Civil War" Publi…
## 19 4 Lincoln… 3 en "US Civil War" Publi…
## 20 4 Lincoln… 3 en "US Civil War" Publi…
## # ℹ 2 more variables: has_text <lgl>, word <chr>
# Attach the Bing lexicon's sentiment label to every remaining token.
# An inner join keeps only tokens that appear in the lexicon; the key is
# spelled out so the match is always on `word` and nothing else.
sentiment_join <- gute_clean |>
  inner_join(get_sentiments("bing"), by = "word")
head(sentiment_join, 20)
## # A tibble: 20 × 9
## gutenberg_id author gutenberg_author_id language gutenberg_bookshelf rights
## <int> <chr> <int> <chr> <chr> <chr>
## 1 6 Henry, … 4 en "American Revoluti… Publi…
## 2 6 Henry, … 4 en "American Revoluti… Publi…
## 3 7 <NA> NA en "" Publi…
## 4 13 Carroll… 7 en "Children's Litera… Publi…
## 5 15 Melvill… 9 en "Best Books Ever L… Publi…
## 6 16 Barrie,… 10 en "Children's Litera… Publi…
## 7 20 Milton,… 17 en "Poetry/Banned Boo… Publi…
## 8 20 Milton,… 17 en "Poetry/Banned Boo… Publi…
## 9 23 Douglas… 34510 en "African American … Publi…
## 10 26 Milton,… 17 en "" Publi…
## 11 26 Milton,… 17 en "" Publi…
## 12 41 Irving,… 34 en "Children's Litera… Publi…
## 13 42 Stevens… 35 en "Precursors of Sci… Publi…
## 14 43 Stevens… 35 en "Precursors of Sci… Publi…
## 15 54 Baum, L… 42 en "Children's Litera… Publi…
## 16 55 Baum, L… 42 en "Children's Litera… Publi…
## 17 56 Polly, … 38 en "" Copyr…
## 18 57 Unknown 49 en "" Publi…
## 19 58 Milton,… 17 en "" Publi…
## 20 59 Descart… 44 en "Harvard Classics/… Publi…
## # ℹ 3 more variables: has_text <lgl>, word <chr>, sentiment <chr>
See how many distinct sentiments there are
# How many different sentiment labels did the lexicon join produce?
sentiment_join |>
  pull(sentiment) |>
  n_distinct()
## [1] 2
Assign numerical scores to positive and negative sentiments
# Translate the sentiment label into a number: "positive" becomes 1 and
# any other label becomes -1.
sentiment_w_score <- mutate(
  sentiment_join,
  sentiment_score = if_else(
    condition = sentiment == "positive",
    true = 1,
    false = -1
  )
)
sentiment_w_score |> head(n = 20)
## # A tibble: 20 × 10
## gutenberg_id author gutenberg_author_id language gutenberg_bookshelf rights
## <int> <chr> <int> <chr> <chr> <chr>
## 1 6 Henry, … 4 en "American Revoluti… Publi…
## 2 6 Henry, … 4 en "American Revoluti… Publi…
## 3 7 <NA> NA en "" Publi…
## 4 13 Carroll… 7 en "Children's Litera… Publi…
## 5 15 Melvill… 9 en "Best Books Ever L… Publi…
## 6 16 Barrie,… 10 en "Children's Litera… Publi…
## 7 20 Milton,… 17 en "Poetry/Banned Boo… Publi…
## 8 20 Milton,… 17 en "Poetry/Banned Boo… Publi…
## 9 23 Douglas… 34510 en "African American … Publi…
## 10 26 Milton,… 17 en "" Publi…
## 11 26 Milton,… 17 en "" Publi…
## 12 41 Irving,… 34 en "Children's Litera… Publi…
## 13 42 Stevens… 35 en "Precursors of Sci… Publi…
## 14 43 Stevens… 35 en "Precursors of Sci… Publi…
## 15 54 Baum, L… 42 en "Children's Litera… Publi…
## 16 55 Baum, L… 42 en "Children's Litera… Publi…
## 17 56 Polly, … 38 en "" Copyr…
## 18 57 Unknown 49 en "" Publi…
## 19 58 Milton,… 17 en "" Publi…
## 20 59 Descart… 44 en "Harvard Classics/… Publi…
## # ℹ 4 more variables: has_text <lgl>, word <chr>, sentiment <chr>,
## # sentiment_score <dbl>
Remove missing values (NAs) and compute mean sentiment scores by author
# Drop only the rows whose author is missing. `na.omit()` on the whole
# table would also discard rows that have an NA in any unrelated metadata
# column (bookshelf, rights, ...), silently shrinking the per-author sample.
sentiment_w_score <- sentiment_w_score |>
  filter(!is.na(author))
# Per-author mean of the word-level scores. `count` is the number of
# sentiment-bearing title words for that author, not the number of titles.
mean_score_by_author <- sentiment_w_score |>
  group_by(author) |>
  summarize(avg_sentiment_score = mean(sentiment_score), count = n())
head(mean_score_by_author, 20)
## # A tibble: 20 × 3
## author avg_sentiment_score count
## <chr> <dbl> <int>
## 1 A British officer -1 1
## 2 A Californian -1 1
## 3 A-No. 1 -0.333 3
## 4 A. L. O. E. 0.6 5
## 5 A.L.O.C. -1 3
## 6 Aaron, S. F. (Samuel Francis) 1 1
## 7 Abbot, Francis Ellingwood 1 1
## 8 Abbot, Robert, cook 1 2
## 9 Abbott, Charles C. (Charles Conrad) 0 2
## 10 Abbott, Edwin Abbott 1 1
## 11 Abbott, Eleanor Hallowell -0.111 9
## 12 Abbott, Henry -1 1
## 13 Abbott, Jacob 0.429 7
## 14 Abbott, Jane 1 1
## 15 Abbott, John S. C. (John Stevens Cabot) -0.333 3
## 16 Abdullah, Achmed 1 1
## 17 Abel, Annie Heloise 1 1
## 18 Abelard, Peter 1 1
## 19 Abercrombie, Lascelles 1 1
## 20 Abernathy, Robert -1 5
Keep only authors with 50 or more sentiment-bearing title words (`count` here is the number of matched words, not the number of titles)
# Keep authors whose `count` (number of scored title words) is 50 or more.
most_popular <- filter(mean_score_by_author, count >= 50)
# Show them from lowest to highest average sentiment score.
arrange(most_popular, avg_sentiment_score)
## # A tibble: 16 × 3
## author avg_sentiment_score count
## <chr> <dbl> <int>
## 1 United States. Work Projects Administration -1 69
## 2 Twain, Mark -0.803 61
## 3 Carter, Nicholas (House name) -0.709 55
## 4 Shakespeare, William -0.692 52
## 5 Le Queux, William -0.667 54
## 6 Various -0.315 981
## 7 Stratemeyer, Edward -0.108 65
## 8 Motley, John Lothrop -0.0513 78
## 9 Defoe, Daniel 0.0566 53
## 10 Standish, Burt L. 0.0943 53
## 11 Unknown 0.133 83
## 12 Anonymous 0.175 378
## 13 Alger, Horatio, Jr. 0.229 70
## 14 Baum, L. Frank (Lyman Frank) 0.439 57
## 15 Dante Alighieri 0.492 59
## 16 Appleton, Victor 0.698 53
# Preview the filtered table in its stored (unsorted) order.
most_popular |> head(n = 20)
## # A tibble: 16 × 3
## author avg_sentiment_score count
## <chr> <dbl> <int>
## 1 Alger, Horatio, Jr. 0.229 70
## 2 Anonymous 0.175 378
## 3 Appleton, Victor 0.698 53
## 4 Baum, L. Frank (Lyman Frank) 0.439 57
## 5 Carter, Nicholas (House name) -0.709 55
## 6 Dante Alighieri 0.492 59
## 7 Defoe, Daniel 0.0566 53
## 8 Le Queux, William -0.667 54
## 9 Motley, John Lothrop -0.0513 78
## 10 Shakespeare, William -0.692 52
## 11 Standish, Burt L. 0.0943 53
## 12 Stratemeyer, Edward -0.108 65
## 13 Twain, Mark -0.803 61
## 14 United States. Work Projects Administration -1 69
## 15 Unknown 0.133 83
## 16 Various -0.315 981
Scoring all of an author's title words together skews the result toward
that author's longer titles. Instead, I want to score each title first
and then average those title scores by author, so that each title is
weighted equally.
# Collapse to one row per book: average the word scores within each
# gutenberg_id and record how many scored words the title contained.
new_sentiment_join <- summarize(
  group_by(sentiment_w_score, gutenberg_id),
  avg_sentiment_score = mean(sentiment_score),
  count = n()
)
new_sentiment_join |> head(n = 20)
## # A tibble: 20 × 3
## gutenberg_id avg_sentiment_score count
## <int> <dbl> <int>
## 1 6 0 2
## 2 13 -1 1
## 3 15 -1 1
## 4 16 -1 1
## 5 20 0 2
## 6 23 -1 1
## 7 26 0 2
## 8 41 -1 1
## 9 42 -1 1
## 10 43 -1 1
## 11 54 1 1
## 12 55 1 1
## 13 56 -1 1
## 14 57 1 1
## 15 58 1 1
## 16 59 1 1
## 17 66 1 1
## 18 71 -1 1
## 19 73 1 1
## 20 84 1 1
Subset to verify that some titles have scores strictly between -1 and 1 (and not exactly 0)
# QA: list titles whose average is a mixed value, i.e. strictly inside
# (-1, 1) and not exactly 0.
new_sentiment_join |>
  filter(abs(avg_sentiment_score) < 1, avg_sentiment_score != 0)
## # A tibble: 326 × 3
## gutenberg_id avg_sentiment_score count
## <int> <dbl> <int>
## 1 636 0.333 3
## 2 713 0.333 3
## 3 884 0.333 3
## 4 949 0.333 3
## 5 958 -0.333 3
## 6 961 -0.2 5
## 7 2052 -0.6 5
## 8 2062 -0.333 3
## 9 2389 0.333 3
## 10 2517 0.333 3
## # ℹ 316 more rows
Join the per-title scores to the metadata and omit rows with a missing author
# Bring the title-level scores back together with the catalogue metadata,
# then drop whole rows that have no author. Assigning `na.omit()` of a single
# column back into the table cannot remove rows: it is a no-op when nothing
# is missing and a length mismatch otherwise.
new_df <- new_sentiment_join |>
  left_join(gute, by = "gutenberg_id") |>
  filter(!is.na(author))
head(new_df, 20)
## # A tibble: 20 × 10
## gutenberg_id avg_sentiment_score count title author gutenberg_author_id
## <int> <dbl> <int> <chr> <chr> <int>
## 1 6 0 2 Give Me Li… Henry… 4
## 2 13 -1 1 The Huntin… Carro… 7
## 3 15 -1 1 Moby-Dick;… Melvi… 9
## 4 16 -1 1 Peter Pan Barri… 10
## 5 20 0 2 Paradise L… Milto… 17
## 6 23 -1 1 Narrative … Dougl… 34510
## 7 26 0 2 Paradise L… Milto… 17
## 8 41 -1 1 The Legend… Irvin… 34
## 9 42 -1 1 The Strang… Steve… 35
## 10 43 -1 1 The Strang… Steve… 35
## 11 54 1 1 The Marvel… Baum,… 42
## 12 55 1 1 The Wonder… Baum,… 42
## 13 56 -1 1 NREN for A… Polly… 38
## 14 57 1 1 Aladdin an… Unkno… 49
## 15 58 1 1 Paradise R… Milto… 17
## 16 59 1 1 Discourse … Desca… 44
## 17 66 1 1 The Dawn o… Joly,… 50
## 18 71 -1 1 On the Dut… Thore… 54
## 19 73 1 1 The Red Ba… Crane… 55
## 20 84 1 1 Frankenste… Shell… 61
## # ℹ 4 more variables: language <chr>, gutenberg_bookshelf <chr>, rights <chr>,
## # has_text <lgl>
Create mean score by author based on title sentiment score
# Average the per-title scores for each author so every title carries
# equal weight; `count` is the number of `new_df` rows (scored titles)
# belonging to that author.
average_title_by_author <- summarize(
  group_by(new_df, author),
  title_sentiment_score = mean(avg_sentiment_score),
  count = n()
)
average_title_by_author |> head(n = 20)
## # A tibble: 20 × 3
## author title_sentiment_score count
## <chr> <dbl> <int>
## 1 A British officer -1 1
## 2 A Californian -1 1
## 3 A-No. 1 -0.333 1
## 4 A. L. O. E. 0.6 5
## 5 A.L.O.C. -1 1
## 6 Aaron, S. F. (Samuel Francis) 1 1
## 7 Abbot, Francis Ellingwood 1 1
## 8 Abbot, Robert, cook 1 1
## 9 Abbott, Charles C. (Charles Conrad) 0 2
## 10 Abbott, Edwin Abbott 1 1
## 11 Abbott, Eleanor Hallowell -0.167 6
## 12 Abbott, Henry -1 1
## 13 Abbott, Jacob 0.556 3
## 14 Abbott, Jane 1 1
## 15 Abbott, John S. C. (John Stevens Cabot) -0.333 3
## 16 Abdullah, Achmed 1 1
## 17 Abel, Annie Heloise 1 1
## 18 Abelard, Peter 1 1
## 19 Abercrombie, Lascelles 1 1
## 20 Abernathy, Robert -1 5
Keep only authors with 25+ scored titles
# Restrict to authors with a `count` of 25 or more.
author_sentiment <- filter(average_title_by_author, count >= 25)
# Display them ordered from most negative to most positive score.
arrange(author_sentiment, title_sentiment_score)
## # A tibble: 29 × 3
## author title_sentiment_score count
## <chr> <dbl> <int>
## 1 United States. Work Projects Administration -1 34
## 2 Carter, Nicholas (House name) -0.789 38
## 3 Twain, Mark -0.782 55
## 4 Snell, Roy J. (Roy Judson) -0.76 25
## 5 Doyle, Arthur Conan -0.688 32
## 6 Shakespeare, William -0.686 51
## 7 Le Queux, William -0.667 40
## 8 Howells, William Dean -0.6 30
## 9 Hope, Laura Lee -0.5 28
## 10 Fenn, George Manville -0.4 40
## # ℹ 19 more rows
# Preview the filtered table in its stored (unsorted) order.
author_sentiment |> head(n = 20)
## # A tibble: 20 × 3
## author title_sentiment_score count
## <chr> <dbl> <int>
## 1 Alger, Horatio, Jr. 0.253 50
## 2 Anonymous 0.213 236
## 3 Appleton, Victor 0.667 32
## 4 Ballantyne, R. M. (Robert Michael) -0.121 33
## 5 Baum, L. Frank (Lyman Frank) 0.578 42
## 6 Carter, Nicholas (House name) -0.789 38
## 7 Dante Alighieri 0.564 39
## 8 Defoe, Daniel 0.202 28
## 9 Dickens, Charles -0.371 35
## 10 Doyle, Arthur Conan -0.688 32
## 11 Fenn, George Manville -0.4 40
## 12 Henty, G. A. (George Alfred) -0.0833 36
## 13 Hope, Laura Lee -0.5 28
## 14 Howells, William Dean -0.6 30
## 15 Jacobs, W. W. (William Wymark) -0.370 36
## 16 Kingston, William Henry Giles 0.0882 34
## 17 Le Queux, William -0.667 40
## 18 Motley, John Lothrop -0.075 40
## 19 Parker, Gilbert 0.395 43
## 20 Shakespeare, William -0.686 51
QA why this author is perfectly -1
# Pull this author's catalogue rows to inspect the underlying titles.
gute |>
  filter(author == "United States. Work Projects Administration")
## # A tibble: 34 × 8
## gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
## <int> <chr> <chr> <int> <chr> <chr>
## 1 11255 Slave N… Unite… 3906 en Slavery
## 2 11422 Slave N… Unite… 3906 en Slavery
## 3 11485 Slave N… Unite… 3906 en Slavery
## 4 11544 Slave N… Unite… 3906 en Slavery
## 5 11552 Slave N… Unite… 3906 en Slavery
## 6 11709 Slave N… Unite… 3906 en Slavery
## 7 11920 Slave N… Unite… 3906 en Slavery
## 8 12055 Slave N… Unite… 3906 en Slavery
## 9 12297 Slave N… Unite… 3906 en Slavery
## 10 13217 Slave N… Unite… 3906 en Slavery
## # ℹ 24 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>
View most positive authors
# Rank authors from highest to lowest title sentiment score.
arrange(author_sentiment, desc(title_sentiment_score))
## # A tibble: 29 × 3
## author title_sentiment_score count
## <chr> <dbl> <int>
## 1 Weymouth, Richard Francis 1 28
## 2 Appleton, Victor 0.667 32
## 3 Baum, L. Frank (Lyman Frank) 0.578 42
## 4 Dante Alighieri 0.564 39
## 5 Parker, Gilbert 0.395 43
## 6 Alger, Horatio, Jr. 0.253 50
## 7 Anonymous 0.213 236
## 8 Defoe, Daniel 0.202 28
## 9 Wells, H. G. (Herbert George) 0.173 27
## 10 Standish, Burt L. 0.167 34
## # ℹ 19 more rows
# Preview the stored table again; the sorted view above was not assigned.
author_sentiment |> head(n = 20)
## # A tibble: 20 × 3
## author title_sentiment_score count
## <chr> <dbl> <int>
## 1 Alger, Horatio, Jr. 0.253 50
## 2 Anonymous 0.213 236
## 3 Appleton, Victor 0.667 32
## 4 Ballantyne, R. M. (Robert Michael) -0.121 33
## 5 Baum, L. Frank (Lyman Frank) 0.578 42
## 6 Carter, Nicholas (House name) -0.789 38
## 7 Dante Alighieri 0.564 39
## 8 Defoe, Daniel 0.202 28
## 9 Dickens, Charles -0.371 35
## 10 Doyle, Arthur Conan -0.688 32
## 11 Fenn, George Manville -0.4 40
## 12 Henty, G. A. (George Alfred) -0.0833 36
## 13 Hope, Laura Lee -0.5 28
## 14 Howells, William Dean -0.6 30
## 15 Jacobs, W. W. (William Wymark) -0.370 36
## 16 Kingston, William Henry Giles 0.0882 34
## 17 Le Queux, William -0.667 40
## 18 Motley, John Lothrop -0.075 40
## 19 Parker, Gilbert 0.395 43
## 20 Shakespeare, William -0.686 51
QA why Weymouth is exactly 1
# Pull this author's catalogue rows to inspect the underlying titles.
gute |>
  filter(author == "Weymouth, Richard Francis")
## # A tibble: 28 × 8
## gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
## <int> <chr> <chr> <int> <chr> <chr>
## 1 8827 Weymout… Weymo… 2897 en ""
## 2 8828 Weymout… Weymo… 2897 en ""
## 3 8829 Weymout… Weymo… 2897 en ""
## 4 8830 Weymout… Weymo… 2897 en ""
## 5 8831 Weymout… Weymo… 2897 en ""
## 6 8832 Weymout… Weymo… 2897 en ""
## 7 8833 Weymout… Weymo… 2897 en ""
## 8 8834 Weymout… Weymo… 2897 en ""
## 9 8835 Weymout… Weymo… 2897 en ""
## 10 8836 Weymout… Weymo… 2897 en ""
## # ℹ 18 more rows
## # ℹ 2 more variables: rights <chr>, has_text <lgl>