Instructions

  1. Clone this homework repo to your homework directory as a new repo.
  2. Rename the starter file under the analysis directory to hw_01_yourname.Rmd and use it for your solutions.
  3. Modify the “author” field in the YAML header.
  4. Stage and commit the R Markdown and HTML files (no PDF files).
  5. Push both the .Rmd and HTML files to GitHub.
  6. Commit each time you answer a part of a question, e.g., 1.1.
  7. Push to GitHub after each major question.
  8. When complete, submit a response in Canvas.

1 Sentiment Analysis

Load the required libraries:
library(tidyverse)
## ── Attaching packages ─────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidytext)
library(gutenbergr)
  1. Download the following two early-20th-century works from Project Gutenberg:
# check each Gutenberg ID
# gutenberg_works() %>%
#   filter(title == "The Jungle")
# gutenberg_works() %>%
#   filter(str_detect(author, " Bois"))
 
theJungle <- gutenberg_download(140)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
theQuest <- gutenberg_download(15265)
  2. Write a function that takes a downloaded book tibble as its argument and returns it in tidy text format.
tidyJungleAndQuest <- function(tidyBooks){
  stopifnot(is.data.frame(tidyBooks))
  if (tidyBooks$gutenberg_id[[1]] == 140) {
    # The Jungle: chapter headings look like "CHAPTER I"
    tidyBooks %>%
      mutate(linenumber = row_number(),
             chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                     ignore_case = TRUE)))) %>%
      unnest_tokens(word, text) %>%
      mutate(word = str_extract(word, "[a-z']+")) %>%
      anti_join(stop_words, by = "word") %>%
      filter(!is.na(word))
  } else if (tidyBooks$gutenberg_id[[1]] == 15265) {
    # The Quest: chapter headings are underscored words such as "_One_";
    # recode front-matter lines so they are not counted as chapters
    tidyBooks %>%
      mutate(linenumber = row_number(),
             text = recode(text, "_Contents_" = "Contents",
                           "_Note_" = "Note"),
             chapter = cumsum(str_detect(text, regex("(^_)([a-z]+)([-]{0,1})([a-z]+)(_$)",
                                                     ignore_case = TRUE)))) %>%
      unnest_tokens(word, text) %>%
      mutate(word = str_extract(word, "[a-z']+")) %>%
      anti_join(stop_words, by = "word") %>%
      filter(!is.na(word))
  }
}
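The chapter counter uses a standard tidytext idiom: str_detect() flags heading lines and cumsum() over that logical vector gives every line the number of headings seen so far. A minimal illustration on made-up lines:

# cumsum() over a logical vector yields a running chapter count
tibble(text = c("CHAPTER I", "It was morning.", "CHAPTER II", "Later on.")) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE))))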
  3. Use the function from step 2 on each book.
theJungle %>%
  tidyJungleAndQuest() %>%
  mutate(book = "The Jungle",
         author = "Sinclair, Upton") -> theJungle

theQuest %>%
  tidyJungleAndQuest() %>%
  mutate(book = "The Quest of the Silver Fleece: A Novel",
         author = "Du Bois, W. E. B. (William Edward Burghardt)") -> theQuest 
  4. Use a dplyr function to combine the two tibbles into a new tibble.
theJungle %>%
  full_join(theQuest,
            by = c("gutenberg_id", "linenumber", "chapter", 
                   "word", "book", "author")) -> jungleAndQuest 
  5. Measure the net sentiment using bing for each block of 50 lines.
jungleAndQuest %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(index = linenumber %/% 50, sentiment, book, sort = TRUE) %>%
  pivot_wider(names_from = sentiment, values_from = n, 
              values_fill = list(n = 0)) %>%
  mutate(net = positive - negative) -> jungleAndQuestBing50

ggplot(data = jungleAndQuestBing50, aes(x = index, y = net, fill = book)) + 
  geom_col(show.legend = FALSE) +
  theme_bw() +
  facet_wrap(~book, ncol = 2, scales = "free_x") -> bing50
plot(bing50)
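The index comes from integer division: linenumber %/% 50 assigns consecutive lines to 50-line blocks. A tiny demonstration:

# lines 1-49 fall in block 0, 50-99 in block 1, and so on
tibble(linenumber = c(1, 49, 50, 99, 100)) %>%
  mutate(index = linenumber %/% 50)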

  6. Measure the total for each nrc sentiment in each block of 500 lines and then plot the results.
get_sentiments("nrc") %>%
  filter(sentiment != "positive" & sentiment != "negative") %>%
  inner_join(jungleAndQuest, by = "word") %>%
  count(index = linenumber %/% 500, sentiment, book, sort = TRUE) -> 
  nrcSentimentsNoPosNeg

ggplot(data = nrcSentimentsNoPosNeg, 
       aes(x = index, y = n, fill = sentiment)) + 
  geom_col() +
  theme_bw() +
  facet_grid(~book) +
  labs(x = "500-line block", y = "count", fill = "nrc sentiment") 

  7. Using bing, create a new data frame with the counts of the positive and negative sentiment words for each book.
jungleAndQuest %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(book, word, sentiment, sort = TRUE) %>%
  ungroup() -> bingSentimentsJungleAndQuest

head(bingSentimentsJungleAndQuest, 20)
## # A tibble: 20 x 4
##    book                                    word    sentiment     n
##    <chr>                                   <chr>   <chr>     <int>
##  1 The Quest of the Silver Fleece: A Novel miss    negative    469
##  2 The Quest of the Silver Fleece: A Novel slowly  negative    124
##  3 The Quest of the Silver Fleece: A Novel dark    negative     96
##  4 The Jungle                              poor    negative     80
##  5 The Jungle                              cold    negative     79
##  6 The Jungle                              hard    negative     61
##  7 The Jungle                              lost    negative     61
##  8 The Jungle                              wild    negative     55
##  9 The Quest of the Silver Fleece: A Novel love    positive     55
## 10 The Jungle                              fell    negative     51
## 11 The Quest of the Silver Fleece: A Novel fell    negative     50
## 12 The Quest of the Silver Fleece: A Novel mighty  positive     48
## 13 The Jungle                              death   negative     47
## 14 The Jungle                              free    positive     44
## 15 The Jungle                              killing negative     43
## 16 The Quest of the Silver Fleece: A Novel silent  positive     43
## 17 The Jungle                              cry     negative     41
## 18 The Jungle                              agony   negative     40
## 19 The Quest of the Silver Fleece: A Novel hard    negative     40
## 20 The Jungle                              dead    negative     39
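The same counts also give each book's overall balance; a sketch using the columns produced above:

# net bing sentiment per book
bingSentimentsJungleAndQuest %>%
  group_by(book, sentiment) %>%
  summarize(total = sum(n), .groups = "drop") %>%
  pivot_wider(names_from = sentiment, values_from = total) %>%
  mutate(net = positive - negative)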
  8. Plot the top ten words for each of the positive and negative sentiments, faceting by book.
bingSentimentsJungleAndQuest %>%
  group_by(book, sentiment) %>%
  slice_max(order_by = n, n = 10) %>% 
  mutate(word = reorder_within(word, n, book)) %>%
  ungroup() %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(sentiment ~ book, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip() +
  scale_x_reordered() 

  9. Remove the inappropriate word(s) from the analysis.
# remove the word "miss" from the bing sentiment lexicon
get_sentiments("bing") %>%
  filter(word != "miss") -> bing_no_miss

# redo the analysis from the beginning
jungleAndQuest %>%
  inner_join(bing_no_miss, by = "word") %>%
  count(book, word, sentiment, sort = TRUE) %>%
  ungroup() -> bing_word_counts

# visualize it
bing_word_counts %>%
  group_by(book, sentiment) %>%
  slice_max(order_by = n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder_within(word, n, book)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(sentiment ~ book, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip() +
  scale_x_reordered()

  10. Extra Credit
# original code in 1.5 
jungleAndQuest %>%
  inner_join(get_sentiments("bing"), by = "word") %>% 
  count(index = linenumber %/% 50, sentiment, book, sort = TRUE) %>%
  pivot_wider(names_from = sentiment, values_from = n, 
              values_fill = list(n = 0)) %>%
  mutate(net = positive - negative) %>%
  ggplot(aes(x = index, y = net, fill = book)) + 
  geom_col(show.legend = FALSE) +
  theme_bw() +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  ggtitle("With Miss as Negative") -> p1
  
# No Miss
jungleAndQuest %>%
  inner_join(bing_no_miss, by = "word") %>% 
  count(index = linenumber %/% 50, sentiment, book, sort = TRUE) %>%
  pivot_wider(names_from = sentiment, values_from = n, 
              values_fill = list(n = 0)) %>%
  mutate(net = positive - negative) %>%
  ggplot(aes(x = index, y = net, fill = book)) + 
  geom_col(show.legend = FALSE) +
  theme_bw() +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  ggtitle("Without Miss as Negative")  -> p2

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
doubleBing50 <- grid.arrange(p1, p2, nrow = 2)
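As a side note, the patchwork package (not loaded above; assumes it is installed) stacks the same two plots with less ceremony:

# alternative stacking with patchwork
library(patchwork)
p1 / p2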

2 tf-idf for Mark Twain’s books

  1. Download the following books by Mark Twain from Project Gutenberg.
gutenberg_works() %>%
  filter(str_detect(author,"Twain")) -> mt

markTwain <- gutenberg_download(c(76, 74, 86, 245, 1837, 119))

adventuresOfHuckleberryFinn <- gutenberg_download(76)
theAdventuresOfTomSawyer <- gutenberg_download(74)
aConnecticutYankeeInKingArthursCourt <- gutenberg_download(86)
lifeOnTheMississippi <- gutenberg_download(245)
thePrinceAndThePauper <- gutenberg_download(1837)
aTrampAbroad <- gutenberg_download(119)

markTwainBooks <- bind_rows(mutate(adventuresOfHuckleberryFinn, book = "Adventures Of Huckleberry Finn"),
          mutate(theAdventuresOfTomSawyer, book = "The Adventures Of Tom Sawyer"),
          mutate(aConnecticutYankeeInKingArthursCourt, book = "A Connecticut Yankee In King Arthurs Court"),
          mutate(lifeOnTheMississippi, book = "Life On The Mississippi"),
          mutate(thePrinceAndThePauper, book = "The Prince And The Pauper"),
          mutate(aTrampAbroad, book = "A Tramp Abroad"))
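The six individual downloads repeat work the combined markTwain call already did. A sketch of an alternative that labels the combined tibble by joining titles from the metadata (markTwainBooksAlt is an illustrative name; assumes the gutenberg_id and title columns returned by gutenberg_works()):

# alternative: attach book titles via the metadata, one download total
markTwainBooksAlt <- markTwain %>%
  left_join(gutenberg_works() %>% select(gutenberg_id, title),
            by = "gutenberg_id") %>%
  rename(book = title)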
  2. Modify your earlier function or create a new one to output a tf-idf-ready data frame (leave the stop words in the text).
readyTfIdf <- function(x){
  stopifnot(is.data.frame(x))
  # one row per book-word pair with its count (stop words retained)
  x %>%
    unnest_tokens(word, text) %>%
    mutate(word = str_extract(word, "[a-z']+")) %>%
    filter(!is.na(word)) %>%
    count(book, word, sort = TRUE) -> bookWords

  # total number of words in each book
  bookWords %>%
    group_by(book) %>%
    summarize(total = sum(n), .groups = "drop") -> ttlWords

  bookWords %>%
    left_join(ttlWords, by = "book")
}
  3. Calculate the tf-idf.
markTwainBooks %>%
  readyTfIdf() %>%
  bind_tf_idf(word, book, n) -> markTwainBooks
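bind_tf_idf() computes tf = n / total for each book and idf = ln(number of books / number of books containing the word). A sketch that recomputes both by hand for comparison (nbooks, n_books, tf_check, and idf_check are illustrative names):

# recompute tf and idf manually and compare with bind_tf_idf()'s columns
nbooks <- n_distinct(markTwainBooks$book)
markTwainBooks %>%
  add_count(word, name = "n_books") %>%  # books containing each word
  mutate(tf_check  = n / total,
         idf_check = log(nbooks / n_books)) %>%
  select(book, word, tf, tf_check, idf, idf_check) %>%
  head()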
  4. Plot the tf for each book using a faceted graph.
markTwainBooks %>% 
ggplot(aes(x = tf, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 852 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).

  5. Show the words with the 15 highest tf-idf values across all books.
markTwainBooks %>%
  arrange(desc(tf_idf)) %>%
  select(book, tf_idf, everything()) %>%
  head(15)
## # A tibble: 15 x 7
##    book                               tf_idf word         n  total      tf   idf
##    <chr>                               <dbl> <chr>    <int>  <int>   <dbl> <dbl>
##  1 The Prince And The Pauper        0.00406  hendon     161  71104 2.26e-3 1.79 
##  2 The Adventures Of Tom Sawyer     0.00253  becky      102  72190 1.41e-3 1.79 
##  3 The Prince And The Pauper        0.00244  canty       97  71104 1.36e-3 1.79 
##  4 The Adventures Of Tom Sawyer     0.00223  huck       232  72190 3.21e-3 0.693
##  5 The Prince And The Pauper        0.00186  prince     191  71104 2.69e-3 0.693
##  6 Adventures Of Huckleberry Finn   0.00144  en         235 113227 2.08e-3 0.693
##  7 The Adventures Of Tom Sawyer     0.00133  joe        138  72190 1.91e-3 0.693
##  8 The Adventures Of Tom Sawyer     0.00119  sid         78  72190 1.08e-3 1.10 
##  9 Life On The Mississippi          0.00113  pilots      93 147364 6.31e-4 1.79 
## 10 Adventures Of Huckleberry Finn   0.00105  warn't     293 113227 2.59e-3 0.405
## 11 A Connecticut Yankee In King Ar… 0.00101  launcel…    67 119087 5.63e-4 1.79 
## 12 A Connecticut Yankee In King Ar… 0.000993 merlin      66 119087 5.54e-4 1.79 
## 13 The Prince And The Pauper        0.000983 hertford    39  71104 5.48e-4 1.79 
## 14 The Prince And The Pauper        0.000983 hugh        39  71104 5.48e-4 1.79 
## 15 Adventures Of Huckleberry Finn   0.000965 dey         61 113227 5.39e-4 1.79
  6. Plot the top 7 tf-idf words from each book.
markTwainBooks %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = parse_factor(word, levels = rev(unique(word)))) %>% 
  group_by(book) %>% 
  slice_max(order_by = tf_idf, n = 7) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, scales = "free") +
  coord_flip() 
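The parse_factor() trick orders words by global tf-idf rank, which can scramble within-facet order when a word scores high in more than one book; reorder_within() with scale_x_reordered() (as in section 1) orders within each facet instead. A sketch of that variant:

# variant: per-facet ordering with reorder_within()
markTwainBooks %>%
  group_by(book) %>%
  slice_max(order_by = tf_idf, n = 7) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, tf_idf, book)) %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, scales = "free") +
  coord_flip() +
  scale_x_reordered()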

3 Extra Credit Podcasts

  1. Sentiment Preserving Fake Reviews (the original paper)

  2. Data in Life: Authorship Attribution in Lennon-McCartney Songs

  3. Newsha Ajami | Improving Urban Water Systems Through Data Science, Public Policy and Engineering

  1. What are some key ideas from this podcast relevant to text sentiment analysis/authorship attribution (1 or 2) or working with large, diverse data sets (3)?
  2. How do you think the ideas discussed may be relevant in your future work?
tidydocs <- function(df){
  # spelled-out chapter headings such as "_One_" ... "_Thirty-one_";
  # the negative look-aheads (?!:) skip table-of-contents lines like "_One_: ..."
  nums <- "(_One_(?!:))|(_Two_(?!:))|(_Three_(?!:))|(_Four_(?!:))|(_Five_(?!:))|(_Six_(?!:))|(_Seven_(?!:))|(_Eight_(?!:))|(_Nine_(?!:))|(_Ten_(?!:))|(_Eleven_(?!:))|(_Twelve_(?!:))|(.+teen_(?!:))|(_Twenty.*_(?!:))|(_Thirty.*_(?!:))"
  c_pattern <- str_c("(?i)(^chapter [\\divxlc])|(", nums, ")")

  # other options considered:
  # regex("((^chapter [\\divxlc])|(^_\\w+_$)|((^_\\w+-\\w+_$)))", ignore_case = TRUE)
  # or two separate cumsum() counters (one per heading style) added together

  df %>%
    # add line and chapter numbers to the dataset
    mutate(linenumber = row_number(),
           chapter = cumsum(str_detect(text, c_pattern))) %>%
    # convert to lower case and put one word per row
    unnest_tokens(word, text) %>%
    # strip any special formatting characters around words
    mutate(word = str_extract(word, "[a-z']+")) %>%
    # remove stop words
    anti_join(stop_words, by = "word") %>%
    # drop any NA's
    filter(!is.na(word))
}
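A quick usage sketch, assuming a raw gutenberg_download() tibble with a text column:

# apply tidydocs() to a freshly downloaded book
gutenberg_download(15265) %>%
  tidydocs() %>%
  head()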