# Packages this walkthrough depends on. Any that are not yet installed are
# installed in a single call before the packages are attached.
packages <- c(
  "dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2",
  "ggplot2", "tidyr", "scales", "janeaustenr"
)
existing <- as.character(installed.packages()[, 1])
missing_packages <- setdiff(packages, existing)
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Attach with library() rather than require(): library() stops with an error
# when a package cannot be loaded, whereas require() only returns FALSE and
# lets the script fail later with a less obvious message.
library(dplyr)
library(tidytext)
library(jiebaR)
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(ggplot2)
library(tidyr)
library(scales)
library(janeaustenr)
1.2
# Toy corpus: four lines of verse stored as a plain character vector,
# one element per line.
text <- c(
  "Because I could not stop for Death -",
  "He kindly stopped for me -",
  "The Carriage held but just Ourselves -",
  "and Immortality"
)
text
## [1] "Because I could not stop for Death -"
## [2] "He kindly stopped for me -"
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"
# Put the verse into a tibble: `line` holds the line number (1 to 4) and
# `text` holds the corresponding string.
text_df <- tibble(
  line = seq_len(4),
  text = text
)
text_df
## # A tibble: 4 x 2
## line text
## <int> <chr>
## 1 1 Because I could not stop for Death -
## 2 2 He kindly stopped for me -
## 3 3 The Carriage held but just Ourselves -
## 4 4 and Immortality
# Split the `text` column into tokens, one per row, stored in a new `word`
# column (the printed result is lower-cased with punctuation removed).
unnest_tokens(text_df, output = word, input = text)
## # A tibble: 20 x 2
## line word
## <int> <chr>
## 1 1 because
## 2 1 i
## 3 1 could
## 4 1 not
## 5 1 stop
## 6 1 for
## 7 1 death
## 8 2 he
## 9 2 kindly
## 10 2 stopped
## 11 2 for
## 12 2 me
## 13 3 the
## 14 3 carriage
## 15 3 held
## 16 3 but
## 17 3 just
## 18 3 ourselves
## 19 4 and
## 20 4 immortality
#unnest_tokens(Data frame, output, input, token = "words")
#token:Unit for tokenizing,"words" (default), "characters", "ngrams", "skip_ngrams", "sentences", "lines"...
# austen_books() holds the six novels completed by Jane Austen; the data has
# the columns `text` and `book`.
# View() opens a data viewer, which only makes sense in an interactive
# session, so skip it when the script is run non-interactively.
if (interactive()) {
  austen_books() %>% View()
}
# Annotate every line of the novels, book by book, with its row number and
# the chapter it belongs to.
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    # Keep the original row position within each book.
    linenumber = row_number(),
    # str_detect() flags lines matching the chapter-heading pattern and
    # cumsum() turns those flags into a running chapter counter.
    # Pattern: `^` anchors at the start of the line, `\d` matches a digit and
    # `ivxlc` matches the letters used in Roman numerals;
    # ignore_case = TRUE makes the match case-insensitive.
    chapter = cumsum(
      str_detect(
        string = text,
        pattern = regex("^chapter [\\divxlc]", ignore_case = TRUE)
      )
    )
  ) %>%
  ungroup()
original_books
## # A tibble: 73,422 x 4
## text book linenumber chapter
## <chr> <fct> <int> <int>
## 1 SENSE AND SENSIBILITY Sense & Sensibility 1 0
## 2 "" Sense & Sensibility 2 0
## 3 by Jane Austen Sense & Sensibility 3 0
## 4 "" Sense & Sensibility 4 0
## 5 (1811) Sense & Sensibility 5 0
## 6 "" Sense & Sensibility 6 0
## 7 "" Sense & Sensibility 7 0
## 8 "" Sense & Sensibility 8 0
## 9 "" Sense & Sensibility 9 0
## 10 CHAPTER 1 Sense & Sensibility 10 1
## # ... with 73,412 more rows
# Tokenize the `text` column of original_books: each token is placed in its
# own row of a new `word` column, and the other columns are carried along.
tidy_books <- unnest_tokens(original_books, output = word, input = text)
tidy_books
## # A tibble: 725,055 x 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # ... with 725,045 more rows
# Load the `stop_words` dataset and remove its words from tidy_books.
data(stop_words)
# anti_join() keeps only the rows of tidy_books whose `word` has no match in
# stop_words. The join key is named explicitly so the result cannot silently
# change if the two tables ever share another column name, and so no
# "Joining, by = ..." message is printed.
tidy_books <- tidy_books %>%
  anti_join(stop_words, by = "word")
head(stop_words)
## # A tibble: 6 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
# 上面列出的停用詞取自字庫(lexicon)SMART
# Tally how often each word occurs, most frequent first.
count(tidy_books, word, sort = TRUE)
## # A tibble: 13,914 x 2
## word n
## <chr> <int>
## 1 miss 1855
## 2 time 1337
## 3 fanny 862
## 4 dear 822
## 5 lady 817
## 6 sir 806
## 7 day 797
## 8 emma 787
## 9 sister 727
## 10 house 699
## # ... with 13,904 more rows
# Count the occurrences of each word and sort them, spelled out with the
# individual verbs: group by word, count the rows per group, then sort by
# that count in descending order.
tidy_books %>%
  group_by(word) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
## # A tibble: 13,914 x 2
## word count
## <chr> <int>
## 1 miss 1855
## 2 time 1337
## 3 fanny 862
## 4 dear 822
## 5 lady 817
## 6 sir 806
## 7 day 797
## 8 emma 787
## 9 sister 727
## 10 house 699
## # ... with 13,904 more rows
#如上
# Keep only the words seen more than 600 times and reorder `word` by its
# count `n` so later plots can draw the words in frequency order.
count(tidy_books, word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n))
## # A tibble: 13 x 2
## word n
## <fct> <int>
## 1 miss 1855
## 2 time 1337
## 3 fanny 862
## 4 dear 822
## 5 lady 817
## 6 sir 806
## 7 day 797
## 8 emma 787
## 9 sister 727
## 10 house 699
## 11 elizabeth 687
## 12 elinor 623
## 13 hope 601
#filter(n > 600)取出部分row
#word = reorder(word, n)將word重新排序依照n
library(ggplot2)
# Bar chart of the words that occur more than 600 times.
count(tidy_books, word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  # Drop the axis title for the word axis.
  labs(x = NULL) +
  # coord_flip() swaps the x and y axes, giving horizontal bars.
  coord_flip()
# Example of the basic workflow: load the data -> remove stop words ->
# count word frequencies -> plot.
# The numbers passed to gutenberg_download() are Project Gutenberg ebook
# numbers (EBook-No.); books can be looked up at http://www.gutenberg.org/.
hgwells <- gutenberg_download(c(35, 36, 5230, 159))

# Tokenize into one word per row and drop stop words. The join key is named
# explicitly so the result does not depend on which columns the two tables
# happen to share.
tidy_hgwells <- hgwells %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Most frequent words first.
tidy_hgwells %>%
  count(word, sort = TRUE)
## # A tibble: 11,769 x 2
## word n
## <chr> <int>
## 1 time 454
## 2 people 302
## 3 door 260
## 4 heard 249
## 5 black 232
## 6 stood 229
## 7 white 222
## 8 hand 218
## 9 kemp 213
## 10 eyes 210
## # ... with 11,759 more rows
# Download five works by their Project Gutenberg ebook numbers.
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767))

# Tokenize into one word per row and drop stop words. The join key is named
# explicitly so the result does not depend on which columns the two tables
# happen to share.
tidy_bronte <- bronte %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Most frequent words first.
tidy_bronte %>%
  count(word, sort = TRUE)
## # A tibble: 23,050 x 2
## word n
## <chr> <int>
## 1 time 1065
## 2 miss 855
## 3 day 827
## 4 hand 768
## 5 eyes 713
## 6 night 647
## 7 heart 638
## 8 looked 601
## 9 door 592
## 10 half 586
## # ... with 23,040 more rows
# Stack the three tidy word tables on top of each other, tagging every row
# with the author it came from.
tidy_bronte %>%
  mutate(author = "Brontë Sisters") %>%
  bind_rows(
    tidy_hgwells %>% mutate(author = "H.G. Wells"),
    tidy_books %>% mutate(author = "Jane Austen")
  )
## # A tibble: 535,379 x 6
## gutenberg_id word author book linenumber chapter
## <int> <chr> <chr> <fct> <int> <int>
## 1 767 _facsimile Bronte Sisters <NA> NA NA
## 2 767 title Bronte Sisters <NA> NA NA
## 3 767 page Bronte Sisters <NA> NA NA
## 4 767 edition_ Bronte Sisters <NA> NA NA
## 5 767 _which Bronte Sisters <NA> NA NA
## 6 767 issued_ Bronte Sisters <NA> NA NA
## 7 767 _together Bronte Sisters <NA> NA NA
## 8 767 with_ Bronte Sisters <NA> NA NA
## 9 767 _wuthering Bronte Sisters <NA> NA NA
## 10 767 heights_ Bronte Sisters <NA> NA NA
## # ... with 535,369 more rows
#bind_rows:將三位作者依照row合併
# Combine the three authors, clean each token and count words per author.
tidy_bronte %>%
  mutate(author = "Brontë Sisters") %>%
  bind_rows(
    tidy_hgwells %>% mutate(author = "H.G. Wells"),
    tidy_books %>% mutate(author = "Jane Austen")
  ) %>%
  # Keep only the first run of lowercase letters/apostrophes in each token.
  mutate(word = str_extract(string = word, pattern = "[a-z']+")) %>%
  count(author, word)
## # A tibble: 48,018 x 3
## author word n
## <chr> <chr> <int>
## 1 Bronte Sisters a 8
## 2 Bronte Sisters a'most 4
## 3 Bronte Sisters aback 1
## 4 Bronte Sisters abaht 1
## 5 Bronte Sisters abandon 8
## 6 Bronte Sisters abandoned 23
## 7 Bronte Sisters abandoning 1
## 8 Bronte Sisters abandonment 5
## 9 Bronte Sisters abase 1
## 10 Bronte Sisters abasement 4
## # ... with 48,008 more rows
#和上個結果比較,show spread用法
# Per-author word proportions, spread into one column per author.
tidy_bronte %>%
  mutate(author = "Brontë Sisters") %>%
  bind_rows(
    tidy_hgwells %>% mutate(author = "H.G. Wells"),
    tidy_books %>% mutate(author = "Jane Austen")
  ) %>%
  # Keep only the first run of lowercase letters/apostrophes in each token.
  mutate(word = str_extract(string = word, pattern = "[a-z']+")) %>%
  count(author, word) %>%
  group_by(author) %>%
  # Share of each word among all of that author's counted words.
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>% # drop the raw count column `n`
  spread(key = author, value = proportion)
## # A tibble: 28,909 x 4
## word `Bronte Sisters` `H.G. Wells` `Jane Austen`
## <chr> <dbl> <dbl> <dbl>
## 1 a 0.0000319 0.0000150 0.00000919
## 2 a'most 0.0000159 NA NA
## 3 a'n't NA NA 0.00000460
## 4 aback 0.00000398 0.0000150 NA
## 5 abaht 0.00000398 NA NA
## 6 abandon 0.0000319 0.0000150 NA
## 7 abandoned 0.0000916 0.000180 0.00000460
## 8 abandoning 0.00000398 0.0000450 NA
## 9 abandonment 0.0000199 0.0000150 NA
## 10 abart NA 0.0000150 NA
## # ... with 28,899 more rows
# Per-author relative word frequencies, reshaped so that each row pairs a
# word's `Jane Austen` proportion with its proportion for one other author.
frequency <- tidy_bronte %>%
  mutate(author = "Brontë Sisters") %>%
  bind_rows(
    tidy_hgwells %>% mutate(author = "H.G. Wells"),
    tidy_books %>% mutate(author = "Jane Austen")
  ) %>%
  # str_extract(word, "[a-z']+") keeps only the alphabetic part of a token,
  # e.g. "_any_" becomes "any".
  mutate(word = str_extract(string = word, pattern = "[a-z']+")) %>%
  count(author, word) %>%
  group_by(author) %>%
  # Share of each word among all of that author's counted words.
  mutate(proportion = n / sum(n)) %>%
  select(-n) %>% # drop the raw count column `n`
  spread(key = author, value = proportion) %>%
  # Fold the two non-Austen columns back into author/proportion pairs while
  # the `Jane Austen` column stays as it is.
  gather(key = author, value = proportion, `Brontë Sisters`:`H.G. Wells`)
frequency
## # A tibble: 57,818 x 4
## word `Jane Austen` author proportion
## <chr> <dbl> <chr> <dbl>
## 1 a 0.00000919 Bronte Sisters 0.0000319
## 2 a'most NA Bronte Sisters 0.0000159
## 3 a'n't 0.00000460 Bronte Sisters NA
## 4 aback NA Bronte Sisters 0.00000398
## 5 abaht NA Bronte Sisters 0.00000398
## 6 abandon NA Bronte Sisters 0.0000319
## 7 abandoned 0.00000460 Bronte Sisters 0.0000916
## 8 abandoning NA Bronte Sisters 0.00000398
## 9 abandonment NA Bronte Sisters 0.0000199
## 10 abart NA Bronte Sisters NA
## # ... with 57,808 more rows
#gather把`Brontë Sisters`與`H.G. Wells`兩欄收合成author與proportion兩欄,`Jane Austen`欄保留不動,方便拿Austen分別和另外兩位作者比較
# Compare each author's word proportions against Jane Austen's, one panel
# per author. Expect a warning about rows with missing values being removed.
ggplot(
  frequency,
  aes(
    x = proportion,
    y = `Jane Austen`,
    color = abs(`Jane Austen` - proportion)
  )
) +
  # geom_abline() draws the dashed reference line.
  geom_abline(color = "gray40", linetype = 2) +
  # geom_jitter() draws the scatter points, with jitter to reduce overlap.
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  # geom_text() prints the words on the plot, skipping overlapping labels.
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  # Both axes are log-scaled and labelled as percentages.
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  facet_wrap(~author, ncol = 2) +
  labs(y = "Jane Austen", x = NULL) +
  theme(legend.position = "none")