library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(ggplot2)
library(tidyr)
library(scales)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
## The following object is masked from 'package:tidyr':
##
## smiths
library(wordcloud)
## Loading required package: RColorBrewer
library(Matrix)
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
book <- gutenberg_download(c(74))
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
doc = paste0(book$text,collapse = "")%>%str_replace_all("「","")%>%str_replace_all("」","")
docVector = unlist(strsplit(book$text,"[,。?]"), use.names=FALSE)
book = data.frame(gutenberg_id = "74" ,stringsAsFactors=F, text = docVector) %>%
filter(text != '') %>% distinct(gutenberg_id, text)
head(book)
## gutenberg_id
## 1 74
## 2 74
## 3 74
## 4 74
## 5 74
## 6 74
## text
## 1 THE ADVENTURES OF TOM SAWYER
## 2 By Mark Twain
## 3 (Samuel Langhorne Clemens)
## 4 CONTENTS
## 5 CHAPTER I. Y-o-u-u Tom-Aunt Polly Decides Upon her Duty--Tom Practices
## 6 Music--The Challenge--A Private Entrance
book1 <- book %>% mutate(linenumber = row_number(),
chapter = cumsum(str_detect(book$text, regex("^CHAPTER ")))) %>%
ungroup() %>%
unnest_tokens(word, text)%>%
filter(!word %in% stop_words$word)
book_sentiment <- book1 %>%
inner_join(get_sentiments("bing")) %>%
count(chapter, index = linenumber %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
ggplot(book_sentiment, aes(index, sentiment, fill = chapter)) +
geom_col(show.legend = FALSE)

com_word <- book1 %>%
count(chapter, word, sort = TRUE)
total_words <- com_word %>%
group_by(chapter) %>%
summarize(total = sum(n))
com_word <- left_join(com_word, total_words)
## Joining, by = "chapter"
com_word
## # A tibble: 17,638 x 4
## chapter word n total
## <int> <chr> <int> <int>
## 1 65 tom 58 1028
## 2 41 tom 53 1260
## 3 53 tom 44 845
## 4 67 huck 44 1121
## 5 67 tom 40 1121
## 6 65 becky 38 1028
## 7 45 tom 36 732
## 8 60 tom 33 1502
## 9 60 huck 31 1502
## 10 69 tom 31 635
## # ... with 17,628 more rows
freq_by_rank <- com_word %>%
group_by(chapter) %>%
mutate(rank = row_number(),
`term frequency` = n/total)
freq_by_rank
## # A tibble: 17,638 x 6
## # Groups: chapter [70]
## chapter word n total rank `term frequency`
## <int> <chr> <int> <int> <int> <dbl>
## 1 65 tom 58 1028 1 0.0564
## 2 41 tom 53 1260 1 0.0421
## 3 53 tom 44 845 1 0.0521
## 4 67 huck 44 1121 1 0.0393
## 5 67 tom 40 1121 2 0.0357
## 6 65 becky 38 1028 2 0.0370
## 7 45 tom 36 732 1 0.0492
## 8 60 tom 33 1502 1 0.0220
## 9 60 huck 31 1502 2 0.0206
## 10 69 tom 31 635 1 0.0488
## # ... with 17,628 more rows
bt_words <- com_word %>%
bind_tf_idf(word, chapter, n)
bt_words %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 17,638 x 6
## chapter word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 5 climax 1 0.25 3.56 0.889
## 2 27 detectives 1 0.2 4.25 0.850
## 3 27 doubts 1 0.2 4.25 0.850
## 4 19 xix 1 0.2 3.56 0.711
## 5 27 xxvii 1 0.2 3.56 0.711
## 6 7 treaty 1 0.167 4.25 0.708
## 7 8 enacted 1 0.167 4.25 0.708
## 8 28 mounts 1 0.167 4.25 0.708
## 9 5 minister 1 0.25 2.46 0.614
## 10 12 generosity 1 0.143 4.25 0.607
## # ... with 17,628 more rows
bt_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(chapter) %>%
top_n(15) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = chapter)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~chapter, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
