hw_01_yourname.Rmd and use it for your solutions.
library(tidyverse)
## ── Attaching packages ─────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
library(gutenbergr)
# Look up the Project Gutenberg ID of each book (kept for reference, not run):
# gutenberg_works() %>%
#   filter(title == "The Jungle")
# gutenberg_works() %>%
#   filter(str_detect(author, " Bois"))
# Download the text with Gutenberg ID 140 (labelled "The Jungle" further down).
theJungle <- gutenberg_download(140)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# Download the text with Gutenberg ID 15265 (labelled "The Quest of the Silver
# Fleece: A Novel" further down).
theQuest <- gutenberg_download(15265)
# NAs 8.1.2: - 0 You could have combined the regex into one with an | and not used the if blocks - would have saved lots of duplicate code.

# Tidy one of the two supported Gutenberg downloads into one row per word.
#
# tidyBooks: data frame with a `gutenberg_id` and a `text` column, as returned
#   by gutenberg_download(). Only the ID in the first row is inspected, and it
#   must be 140 or 15265.
#
# Returns the input with `text` replaced by `word` (lower-cased token, reduced
# to its first run of letters/apostrophes), plus `linenumber` (row number of
# the source line) and `chapter` (running count of chapter headings seen so
# far). Stop words and tokens without any letters are dropped.
#
# Stops with an error when the input is not a non-empty data frame with a
# `gutenberg_id` column, or when the ID is not one of the two supported ones.
tidyJungleAndQuest <- function(tidyBooks) {
  stopifnot(
    is.data.frame(tidyBooks),
    "gutenberg_id" %in% names(tidyBooks),
    nrow(tidyBooks) > 0
  )

  book_id <- tidyBooks$gutenberg_id[[1]]

  # Each book marks chapters differently, so only the heading pattern (and one
  # clean-up step) is book-specific; the rest of the pipeline is shared.
  if (book_id == 140) {
    chapter_pattern <- "^chapter [\\divxlc]"
  } else if (book_id == 15265) {
    # Strip the underscores from these two headings so the pattern below does
    # not count them as chapter starts.
    tidyBooks <- mutate(
      tidyBooks,
      text = recode(text, "_Contents_" = "Contents", "_Note_" = "Note")
    )
    chapter_pattern <- "(^_)([a-z]+)([-]{0,1})([a-z]+)(_$)"
  } else {
    stop(
      "tidyJungleAndQuest() only supports Gutenberg IDs 140 and 15265, not ",
      book_id,
      call. = FALSE
    )
  }

  # Work on the argument itself rather than on the global theJungle/theQuest
  # objects, so the function has no hidden dependencies or side effects.
  tidyBooks %>%
    mutate(
      linenumber = row_number(),
      chapter = cumsum(str_detect(
        text,
        regex(chapter_pattern, ignore_case = TRUE)
      ))
    ) %>%
    ungroup() %>%
    unnest_tokens(word, text) %>%
    # keep only the leading run of letters/apostrophes (drops _ and digits)
    mutate(word = str_extract(word, "[a-z']+")) %>%
    anti_join(stop_words, by = "word") %>%
    filter(!is.na(word))
}
# book and author as variables and save each tibble to a new variable.
# Tidy each download and tag every row with its book title and author.
theJungle <- theJungle %>%
  tidyJungleAndQuest() %>%
  mutate(
    book = "The Jungle",
    author = "Sinclair, Upton"
  )
theQuest <- theQuest %>%
  tidyJungleAndQuest() %>%
  mutate(
    book = "The Quest of the Silver Fleece: A Novel",
    author = "Du Bois, W. E. B. (William Edward Burghardt)"
  )
# Stack the two tidy tibbles. They share the same six columns and never share
# a gutenberg_id, so a full join keyed on every column can only ever append
# rows; bind_rows() states that directly and skips the key matching.
jungleAndQuest <- bind_rows(theJungle, theQuest)
# The Quest of the Silver has more negative words than The Jungle. The majority of both contents have a negative sentiment.
# Net bing sentiment (positive minus negative word count) per book, computed
# over consecutive blocks of 50 source lines.
jungleAndQuestBing50 <- jungleAndQuest %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(index = linenumber %/% 50, sentiment, book, sort = TRUE) %>%
  pivot_wider(
    names_from = sentiment,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  mutate(net = positive - negative)

# One panel per book, each with its own x range.
bing50 <- ggplot(
  data = jungleAndQuestBing50,
  aes(x = index, y = net, fill = book)
) +
  geom_col(show.legend = FALSE) +
  theme_bw() +
  facet_wrap(~book, ncol = 2, scales = "free_x")
plot(bing50)
# Count nrc sentiment words (excluding the generic "positive"/"negative"
# categories) per book over consecutive blocks of 500 source lines.
nrcSentimentsNoPosNeg <- get_sentiments("nrc") %>%
  filter(sentiment != "positive" & sentiment != "negative") %>%
  inner_join(jungleAndQuest, by = "word") %>%
  count(index = linenumber %/% 500, sentiment, book, sort = TRUE)

# Stacked bars per block, coloured by sentiment. The legend is shown because
# the fill colours are the only way to tell the sentiments apart, and the x
# axis is labelled as what it actually is (the block index, not a sentiment).
ggplot(
  data = nrcSentimentsNoPosNeg,
  aes(x = index, y = n, fill = sentiment)
) +
  geom_col() +
  theme_bw() +
  facet_grid(~book) +
  labs(x = "index (500-line blocks)", y = "count", fill = "nrc sentiment")
# How often each bing-lexicon word occurs in each book, most frequent first.
bingSentimentsJungleAndQuest <- ungroup(
  count(
    inner_join(jungleAndQuest, get_sentiments("bing"), by = "word"),
    book, word, sentiment,
    sort = TRUE
  )
)
# Show the 20 most frequent book/word pairs.
head(bingSentimentsJungleAndQuest, 20)
## # A tibble: 20 x 4
## book word sentiment n
## <chr> <chr> <chr> <int>
## 1 The Quest of the Silver Fleece: A Novel miss negative 469
## 2 The Quest of the Silver Fleece: A Novel slowly negative 124
## 3 The Quest of the Silver Fleece: A Novel dark negative 96
## 4 The Jungle poor negative 80
## 5 The Jungle cold negative 79
## 6 The Jungle hard negative 61
## 7 The Jungle lost negative 61
## 8 The Jungle wild negative 55
## 9 The Quest of the Silver Fleece: A Novel love positive 55
## 10 The Jungle fell negative 51
## 11 The Quest of the Silver Fleece: A Novel fell negative 50
## 12 The Quest of the Silver Fleece: A Novel mighty positive 48
## 13 The Jungle death negative 47
## 14 The Jungle free positive 44
## 15 The Jungle killing negative 43
## 16 The Quest of the Silver Fleece: A Novel silent positive 43
## 17 The Jungle cry negative 41
## 18 The Jungle agony negative 40
## 19 The Quest of the Silver Fleece: A Novel hard negative 40
## 20 The Jungle dead negative 39
# miss from the bing sentiment lexicon, because miss is not totally a negative word. For instance: I miss you baby. 8.1.8a: - 1 Did not get 10 words for each sentiment, Need to use reorder_within and scale_x_reordered 0 I miss you baby is negative so it is most likely the use as part of someone’s name, Miss Jones, Miss Smith
# Plot the ten most frequent bing-lexicon words for every book/sentiment pair.
bingSentimentsJungleAndQuest %>%
  group_by(book, sentiment) %>%
  # with_ties = FALSE caps every group at exactly 10 words; the default keeps
  # all words tied for 10th place and can return more.
  slice_max(order_by = n, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  # order the bars by n separately within each book (needs scale_x_reordered)
  mutate(word = reorder_within(word, n, book)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(sentiment ~ book, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip() +
  scale_x_reordered()
# theme(strip.background = element_blank(), strip.placement = "outside") #review HW1
miss, we can detect that The Quest has many negative sentiments. Based on the top ten ranking for each, negative sentiments outnumber positive sentiments in both books.
# remove the word “miss” from the bing sentiment lexicon.
# bing lexicon without the entry for "miss"
bing_no_miss <- get_sentiments("bing") %>%
  filter(word != "miss")
# redo the analysis from the beginning
bing_word_counts <- jungleAndQuest %>%
  inner_join(bing_no_miss, by = "word") %>%
  count(book, word, sentiment, sort = TRUE) %>%
  ungroup()
# visualize it
bing_word_counts %>%
  group_by(book, sentiment) %>%
  # exactly 10 words per book/sentiment, even when counts are tied
  slice_max(order_by = n, n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  # A plain reorder(word, n) builds one global ordering, so a word that is in
  # the top ten of both books ends up out of place in a facet. reorder_within()
  # orders per book and scale_x_reordered() restores the plain word labels.
  mutate(word = reorder_within(word, n, book)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(sentiment ~ book, scales = "free_y") +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip() +
  scale_x_reordered()
miss, the plot shows there is no big change in the analysis.
# original code in 1.5
# Build the faceted net-sentiment plot for one sentiment lexicon.
#
# tidyWords: one-row-per-word data frame with `word`, `linenumber` and `book`.
# lexicon:   data frame with `word` and `sentiment` columns whose sentiment
#            values include "positive" and "negative".
# plotTitle: title shown above the plot.
#
# Returns a ggplot of net sentiment (positive minus negative word count) per
# block of 50 source lines, one panel per book.
plotNetSentiment50 <- function(tidyWords, lexicon, plotTitle) {
  tidyWords %>%
    inner_join(lexicon, by = "word") %>%
    count(index = linenumber %/% 50, sentiment, book, sort = TRUE) %>%
    pivot_wider(
      names_from = sentiment,
      values_from = n,
      values_fill = list(n = 0)
    ) %>%
    mutate(net = positive - negative) %>%
    ggplot(aes(x = index, y = net, fill = book)) +
    geom_col(show.legend = FALSE) +
    theme_bw() +
    facet_wrap(~book, ncol = 2, scales = "free_x") +
    ggtitle(plotTitle)
}

# Full bing lexicon ("miss" counted as a negative word)
p1 <- plotNetSentiment50(
  jungleAndQuest,
  get_sentiments("bing"),
  "With Miss as Negative"
)
# No Miss
p2 <- plotNetSentiment50(
  jungleAndQuest,
  bing_no_miss,
  "Without Miss as Negative"
)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Arrange the with-"miss" plot (p1) above the without-"miss" plot (p2), one
# plot per row, so the two can be compared directly.
doubleBing50 <- grid.arrange(p1, p2, nrow = 2)
# Project Gutenberg works whose author field contains "Twain" (used to look up
# the IDs below).
mt <- gutenberg_works() %>%
  filter(str_detect(author, "Twain"))

# Download the six books once. The per-book tibbles are taken from this single
# result instead of downloading every book a second time.
markTwain <- gutenberg_download(c(76, 74, 86, 245, 1837, 119))
adventuresOfHuckleberryFinn <- filter(markTwain, gutenberg_id == 76)
theAdventuresOfTomSawyer <- filter(markTwain, gutenberg_id == 74)
aConnecticutYankeeInKingArthursCourt <- filter(markTwain, gutenberg_id == 86)
lifeOnTheMississippi <- filter(markTwain, gutenberg_id == 245)
thePrinceAndThePauper <- filter(markTwain, gutenberg_id == 1837)
aTrampAbroad <- filter(markTwain, gutenberg_id == 119)

# Stack the books, tagging every line with its title.
markTwainBooks <- bind_rows(
  mutate(adventuresOfHuckleberryFinn,
         book = "Adventures Of Huckleberry Finn"),
  mutate(theAdventuresOfTomSawyer,
         book = "The Adventures Of Tom Sawyer"),
  mutate(aConnecticutYankeeInKingArthursCourt,
         book = "A Connecticut Yankee In King Arthurs Court"),
  mutate(lifeOnTheMississippi,
         book = "Life On The Mississippi"),
  mutate(thePrinceAndThePauper,
         book = "The Prince And The Pauper"),
  mutate(aTrampAbroad,
         book = "A Tramp Abroad")
)
# NAs and then,

# Count words per book and attach each book's total word count.
#
# x: data frame with a `book` column and a `text` column (one line of text
#    per row).
#
# Returns one row per book/word pair with columns `book`, `word`, `n` (count
# of the word in that book, sorted descending) and `total` (sum of `n` for
# the book). Tokens without any letters are dropped.
#
# Stops with an error when `x` is not a data frame.
readyTfItf <- function(x) {
  # Validate and process the argument itself, not the global markTwainBooks.
  stopifnot(is.data.frame(x)) # ,nrow(x) == 73326))

  wordCounts <- x %>%
    unnest_tokens(word, text) %>%
    # keep only the leading run of letters/apostrophes
    mutate(word = str_extract(word, "[a-z']+")) %>%
    filter(!is.na(word)) %>%
    count(book, word, sort = TRUE)

  # calculate how many words in each book = total variable
  ttlWords <- wordCounts %>%
    group_by(book) %>%
    summarize(total = sum(n), .groups = "drop")

  left_join(wordCounts, ttlWords, by = "book")
}
# Replace the raw lines with per-book word counts and add tf, idf and tf_idf.
markTwainBooks <- bind_tf_idf(readyTfItf(markTwainBooks), word, book, n)

# Distribution of term frequencies per book, limited to tf <= 0.0009.
ggplot(markTwainBooks, aes(x = tf, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 852 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).
# The 15 book/word pairs with the highest tf-idf, with book and tf_idf moved
# to the front of the table.
markTwainBooks %>%
  arrange(desc(tf_idf)) %>%
  relocate(book, tf_idf) %>%
  slice_head(n = 15)
## # A tibble: 15 x 7
## book tf_idf word n total tf idf
## <chr> <dbl> <chr> <int> <int> <dbl> <dbl>
## 1 The Prince And The Pauper 0.00406 hendon 161 71104 2.26e-3 1.79
## 2 The Adventures Of Tom Sawyer 0.00253 becky 102 72190 1.41e-3 1.79
## 3 The Prince And The Pauper 0.00244 canty 97 71104 1.36e-3 1.79
## 4 The Adventures Of Tom Sawyer 0.00223 huck 232 72190 3.21e-3 0.693
## 5 The Prince And The Pauper 0.00186 prince 191 71104 2.69e-3 0.693
## 6 Adventures Of Huckleberry Finn 0.00144 en 235 113227 2.08e-3 0.693
## 7 The Adventures Of Tom Sawyer 0.00133 joe 138 72190 1.91e-3 0.693
## 8 The Adventures Of Tom Sawyer 0.00119 sid 78 72190 1.08e-3 1.10
## 9 Life On The Mississippi 0.00113 pilots 93 147364 6.31e-4 1.79
## 10 Adventures Of Huckleberry Finn 0.00105 warn't 293 113227 2.59e-3 0.405
## 11 A Connecticut Yankee In King Ar… 0.00101 launcel… 67 119087 5.63e-4 1.79
## 12 A Connecticut Yankee In King Ar… 0.000993 merlin 66 119087 5.54e-4 1.79
## 13 The Prince And The Pauper 0.000983 hertford 39 71104 5.48e-4 1.79
## 14 The Prince And The Pauper 0.000983 hugh 39 71104 5.48e-4 1.79
## 15 Adventures Of Huckleberry Finn 0.000965 dey 61 113227 5.39e-4 1.79
# Plot the seven highest tf-idf words of every book, one panel per book.
markTwainBooks %>%
  group_by(book) %>%
  slice_max(order_by = tf_idf, n = 7) %>%
  ungroup() %>%
  # A single global factor ordering mis-sorts any word that appears in more
  # than one panel; reorder_within() sorts by tf_idf separately per book and
  # scale_x_reordered() restores the plain word labels.
  mutate(word = reorder_within(word, tf_idf, book)) %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, scales = "free") +
  coord_flip() +
  scale_x_reordered()
# Extra Credit Podcasts
Data in Life: Authorship Attribution in Lennon-McCartney Songs
Newsha Ajami| Improving Urban Water Systems Through Data Science, Public Policy and Engineering
Sentiment Preserving Fake Reviews for answering question.
As technology constantly evolves, computers are increasingly used to compile and analyze structured data through Natural Language Processing. For example, with the sentiment lexicon data frames we learned about this week, we can analyze many words and articles directly by computer and avoid overly subjective human judgments. So how to organize and optimize words for NLP may be the most relevant question for us in the future.
Note: the elegant way to tidy book
# Tidy a downloaded book into one row per word.
#
# df: data frame with a `text` column (one line of the book per row).
#
# Returns `df` with `text` replaced by `word` (lower-cased token, reduced to
# its first run of letters/apostrophes), plus `linenumber` (row number of the
# source line) and `chapter` (running count of lines matching the chapter
# pattern). Stop words and tokens without any letters are dropped. The result
# is returned visibly, so calling the function at the console prints it.
#
# Stops with an error when `df` is not a data frame.
tidydocs <- function(df) {
  stopifnot(is.data.frame(df))

  # Spelled-out number headings wrapped in underscores (e.g. "_One_"), not
  # followed by a colon.
  nums <- "(_One_(?!:))|(_Two_(?!:))|(_Three_(?!:))|(_Four_(?!:))|(_Five_(?!:))|(_Six_(?!:))|(_Seven_(?!:))|(_Eight_(?!:))|(_Nine_(?!:))|(_Ten_(?!:))|(_Eleven_(?!:))|(_Twelve_(?!:))|(.+teen_(?!:))|(_Twenty.*_(?!:))|(_Thirty.*_(?!:))"
  # One pattern for both heading styles; the leading (?i) makes the whole
  # pattern case-insensitive.
  c_pattern <- str_c("(?i)(^chapter [\\divxlc])|(", nums, ")", collapse = "")
  # Other Options
  # regex("((^chapter [\\divxlc])|(^_\\w+_$)|((^_\\w+-\\w+_$)))", ignore_case = TRUE)
  #
  # mutate(linenumber=row_number(),
  # chapter1 = cumsum(str_detect(text,
  # regex("^chapter [\\divxlc]",
  # ignore_case = TRUE))),
  # chapter2 = cumsum(str_detect(text,
  # "^_(Ni|O|T|S|F|E)[:lower:]+[-]*[:lower:]*_$")),
  # chapter=chapter2+chapter1,
  # chapter1=NULL,
  # chapter2=NULL) %>%
  #
  #
  df %>%
    # add line and chapter numbers to dataset
    mutate(
      linenumber = row_number(),
      chapter = cumsum(str_detect(text, c_pattern))
    ) %>%
    # convert all to lower case and put one line per word
    unnest_tokens(word, text) %>%
    # take care of any special formatting characters around words
    mutate(word = str_extract(word, "[a-z']+")) %>%
    # remove stop-words (explicit key avoids the "Joining, by" message)
    anti_join(stop_words, by = "word") %>%
    # get rid of any NA's
    filter(!is.na(word))
}