library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
library(tidyr)
library(scales)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
## The following object is masked from 'package:tidyr':
## 
##     smiths
setwd("/Users/arielchang/Desktop/20190328_H2 2")
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
orginalDF = read.table("消失的愛人(控制).txt",header= F ,sep="\n",fileEncoding = "UTF-8", stringsAsFactors=F)
head(orginalDF)
##                                                               V1
## 1                                                     消失的愛人
## 2                                            [美]吉莉安‧弗琳 著
## 3                                                        胡緋 譯
## 4                                                     中信出版社
## 5                                       圖書在版編目( CIP)數據
## 6 消失的愛人 /(美)弗琳著;胡緋譯 . —北京:中信出版社, 2013.6

1.3 原始資料過濾及分析

過濾評論及其他等前95行內容

novelDf = data_frame(text =orginalDF[-(1:95),])
head(novelDf)
## # A tibble: 6 x 1
##   text                                                                    
##   <chr>                                                                   
## 1 第一部分                                                                
## 2 芳蹤難覓                                                                
## 3 尼克‧鄧恩                                                               
## 4 事發當日                                                                
## 5 每當想起我太太,我總會想起她那顆頭顱。最先想起的是輪廓:第一眼見到她時,我望見的就是她的後腦,那頭顱有著某種曼妙之處,好似一粒閃亮堅硬的玉米,…
## 6 不管在哪兒,我都不會錯認她那顆小腦袋。

以逗號、句號、問號作為斷句的規則

novelVector =  unlist(strsplit(novelDf$text,"[,。?]"), use.names=FALSE) 
head(novelVector)
## [1] "第一部分"             "芳蹤難覓"             "尼克‧鄧恩"          
## [4] "事發當日"             "每當想起我太太"       "我總會想起她那顆頭顱"

將上一步驟取得Vector轉為DataFrame並增加欄位「line」與「chapter」

novelDf = data_frame(text=novelVector)  %>%
  mutate(line=c(1:nrow(.))) %>% 
  mutate(chapter = cumsum(str_detect(.$text, regex("^第.*部分$"))))
head(novelDf)
## # A tibble: 6 x 3
##   text                  line chapter
##   <chr>                <int>   <int>
## 1 第一部分                 1       1
## 2 芳蹤難覓                 2       1
## 3 尼克‧鄧恩                3       1
## 4 事發當日                 4       1
## 5 每當想起我太太           5       1
## 6 我總會想起她那顆頭顱     6       1

1.6 結巴分詞 function

1.6.1 定義結巴分詞需要的字詞庫及 function
setwd("/Users/arielchang/Desktop/20190328_H2 2")
jieba_tokenizer <- worker(stop_word = "stop_words.txt",user="user_words.txt")
#save(DiaryAll,file="AmyAndNick.RData")
#定義丟給unnest_tokens的分詞
book_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    # 將詞彙長度為1的詞清除
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}
1.6.2 透過unnest_tokens段詞功能,可以保留斷詞欄位以外欄位資料,例如章節
tidybook = novelDf %>% unnest_tokens(word,text,token= book_tokenizer)  
head(tidybook)
## # A tibble: 6 x 3
##    line chapter word 
##   <int>   <int> <chr>
## 1     1       1 第一 
## 2     1       1 部分 
## 3     2       1 芳蹤 
## 4     2       1 難覓 
## 5     3       1 尼克 
## 6     4       1 事發

統計每個詞出現的次數

book_words1 <- tidybook %>%
    count(chapter, word, sort = TRUE)

新增每章節總字數欄位

total_words1 <- book_words1 %>% 
  group_by(chapter) %>% 
  summarize(total = sum(n))

book_words1 <- left_join(book_words1, total_words1)
## Joining, by = "chapter"
book_words1
## # A tibble: 25,586 x 4
##    chapter word      n total
##      <int> <chr> <int> <int>
##  1       1 艾米    501 43930
##  2       1 一個    494 43930
##  3       1 尼克    357 43930
##  4       2 尼克    317 28349
##  5       1 已經    288 43930
##  6       2 艾米    286 28349
##  7       2 一個    242 28349
##  8       1 知道    215 43930
##  9       2 已經    204 28349
## 10       1 彷彿    181 43930
## # ... with 25,576 more rows

每章詞性與總字數相除並畫長條圖

ggplot(book_words1, aes(n/total, fill = chapter)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~chapter, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 425 rows containing non-finite values (stat_bin).
## Warning: Removed 3 rows containing missing values (geom_bar).

###examine Zipf’s law

增加rank及詞性頻率

freq_by_rank <- book_words1 %>% 
  group_by(chapter) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)

freq_by_rank
## # A tibble: 25,586 x 6
## # Groups:   chapter [3]
##    chapter word      n total  rank `term frequency`
##      <int> <chr> <int> <int> <int>            <dbl>
##  1       1 艾米    501 43930     1          0.0114 
##  2       1 一個    494 43930     2          0.0112 
##  3       1 尼克    357 43930     3          0.00813
##  4       2 尼克    317 28349     1          0.0112 
##  5       1 已經    288 43930     4          0.00656
##  6       2 艾米    286 28349     2          0.0101 
##  7       2 一個    242 28349     3          0.00854
##  8       1 知道    215 43930     5          0.00489
##  9       2 已經    204 28349     4          0.00720
## 10       1 彷彿    181 43930     6          0.00412
## # ... with 25,576 more rows

examine Zipf’s law

畫出rank及詞性頻率的關係線圖

freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = chapter)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

###examine Zipf’s law

rank_subset <- freq_by_rank %>% 
  filter(rank < 500,
         rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##     -1.6701      -0.6563
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = chapter)) + 
  geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

###bind_tf_idf ###每章節前15個重要的字詞,畫視覺圖

book_words1 <- book_words1 %>%
  bind_tf_idf(word, chapter, n)
book_words1
## # A tibble: 25,586 x 7
##    chapter word      n total      tf   idf tf_idf
##      <int> <chr> <int> <int>   <dbl> <dbl>  <dbl>
##  1       1 艾米    501 43930 0.0114      0      0
##  2       1 一個    494 43930 0.0112      0      0
##  3       1 尼克    357 43930 0.00813     0      0
##  4       2 尼克    317 28349 0.0112      0      0
##  5       1 已經    288 43930 0.00656     0      0
##  6       2 艾米    286 28349 0.0101      0      0
##  7       2 一個    242 28349 0.00854     0      0
##  8       1 知道    215 43930 0.00489     0      0
##  9       2 已經    204 28349 0.00720     0      0
## 10       1 彷彿    181 43930 0.00412     0      0
## # ... with 25,576 more rows
book_words1 %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 25,586 x 6
##    chapter word         n       tf   idf   tf_idf
##      <int> <chr>    <int>    <dbl> <dbl>    <dbl>
##  1       3 返家        11 0.00137  1.10  0.00151 
##  2       2 莎朗        35 0.00123  1.10  0.00136 
##  3       3 短片         9 0.00112  1.10  0.00124 
##  4       2 多蘿西      26 0.000917 1.10  0.00101 
##  5       2 傑夫        69 0.00243  0.405 0.000987
##  6       3 強暴         7 0.000874 1.10  0.000961
##  7       3 證據        17 0.00212  0.405 0.000861
##  8       1 酒吧        93 0.00212  0.405 0.000858
##  9       2 貝爾        18 0.000635 1.10  0.000698
## 10       3 同床共枕     5 0.000625 1.10  0.000686
## # ... with 25,576 more rows
book_words1 %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(chapter) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = chapter)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~chapter, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by tf_idf

physics_words <- tidybook %>%
    count(chapter, word, sort = TRUE)
plot_physics <- physics_words %>%
  bind_tf_idf(word, chapter, n) %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word))))
 

plot_physics %>% 
  group_by(chapter) %>% 
  top_n(15, tf_idf) %>% 
  ungroup() %>%
  mutate(word = reorder(word, tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = chapter)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~chapter, ncol = 2, scales = "free") +
  coord_flip()

用n-gram標記

library(dplyr)
library(tidytext)
library(janeaustenr)