控制（Gone Girl）

本小說為描述主角尼克與女主角艾咪，這對夫妻由情人到婚後生活，因相繼失業問題導致婚姻自此急轉直下，後續因主角尼克對婚姻不忠，導致女主角愛咪精心策劃自導自演一場謀殺和懸案。
第一章：主要敘述了尼克和艾咪的婚姻。
第二章：揭露事件真相，尼克對艾咪不忠搞外遇;艾咪則因身無分文投靠前男友戴西家中。尼克則在律師建議下上電視訪談節目，講述對艾咪感情並陳述殺害妻子。愛咪看到節目後，也誤以為尼克真心願意挽回自己。她誘惑戴西和自己發生性關係，趁機將他殺害，並布置成遭他強姦被迫自衛的假象。
第三章：愛咪逃回家裡，編造故事稱自己遭到戴西綁架，被囚禁成為性奴隸。警方和公眾相信了她的說辭，但尼克知道她在撒謊。但由於社會語論壓力下，尼克最終屈服，兩人得以繼續這段婚姻生活。

載入套件

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidytext)
library(jiebaR)

## Loading required package: jiebaRD

library(stringr)
library(ggplot2)
library(tidyr)
library(scales)

載入小說

setwd("C:/learning/hw2")
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")

## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業
## 系統回報無法實現設定語區為 "zh_TW.UTF-8" 的要求

## [1] ""

orginalDF = read.table("消失的愛人(控制).txt",header= F ,sep="\n",fileEncoding = "UTF-8", stringsAsFactors=F)
head(orginalDF)

##                                                              V1
## 1                                                    消失的愛人
## 2                                            [美]吉莉安‧弗琳 著
## 3                                                       胡緋 譯
## 4                                                    中信出版社
## 5                                      圖書在版編目（ CIP）數據
## 6 消失的愛人 /（美）弗琳著；胡緋譯 . —北京：中信出版社， 2013.6

原始資料過濾及分析

過濾評論及其他等前95行內容

novelDf = data_frame(text =orginalDF[-(1:95),])

## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.

head(novelDf)

## # A tibble: 6 x 1
##   text                                                                     
##   <chr>                                                                    
## 1 第一部分                                                                 
## 2 芳蹤難覓                                                                 
## 3 尼克‧鄧恩                                                                
## 4 事發當日                                                                 
## 5 每當想起我太太，我總會想起她那顆頭顱。最先想起的是輪廓：第一眼見到她時，我望見的就是她的後腦，那頭顱有著某種曼妙之處，好似一粒閃亮堅硬的玉米，要~
## 6 不管在哪兒，我都不會錯認她那顆小腦袋。

以逗號、句號、問號作為斷句的規則

novelVector =  unlist(strsplit(novelDf$text,"[，。？]"), use.names=FALSE) 
head(novelVector)

## [1] "第一部分"             "芳蹤難覓"             "尼克‧鄧恩"           
## [4] "事發當日"             "每當想起我太太"       "我總會想起她那顆頭顱"

將上一步驟取得Vector轉為DataFrame並增加欄位「line」與「chapter」

novelDf = data_frame(text=novelVector)  %>%
  mutate(line=c(1:nrow(.))) %>% 
  mutate(chapter = cumsum(str_detect(.$text, regex("^第.*部分$"))))
head(novelDf)

## # A tibble: 6 x 3
##   text                  line chapter
##   <chr>                <int>   <int>
## 1 第一部分                 1       1
## 2 芳蹤難覓                 2       1
## 3 尼克‧鄧恩                3       1
## 4 事發當日                 4       1
## 5 每當想起我太太           5       1
## 6 我總會想起她那顆頭顱     6       1

結巴分詞 function

定義結巴分詞需要的字詞庫及 function

setwd("C:/learning/hw2")
jieba_tokenizer <- worker(stop_word = "stop_words.txt",user="user_words.txt")
#save(DiaryAll,file="AmyAndNick.RData")
#定義丟給unnest_tokens的分詞
book_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    # 將詞彙長度為1的詞清除
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}

透過unnest_tokens斷詞功能，可以保留斷詞欄位以外欄位資料，例如章節

tidybook = novelDf %>% unnest_tokens(word,text,token= book_tokenizer)  
head(tidybook)

## # A tibble: 6 x 3
##    line chapter word     
##   <int>   <int> <chr>    
## 1     1       1 第一     
## 2     1       1 部分     
## 3     2       1 芳蹤     
## 4     2       1 難覓     
## 5     3       1 尼克‧鄧恩
## 6     4       1 事發

統計每章節每個詞出現的次數

book_words1 <- tidybook %>%
    count(chapter, word, sort = TRUE)

新增每章節總字數欄位

total_words1 <- book_words1 %>% 
  group_by(chapter) %>% 
  summarize(total = sum(n))

book_words1 <- left_join(book_words1, total_words1)

## Joining, by = "chapter"

book_words1

## # A tibble: 25,589 x 4
##    chapter word      n total
##      <int> <chr> <int> <int>
##  1       1 艾米    501 43930
##  2       1 一個    494 43930
##  3       1 尼克    332 43930
##  4       2 尼克    293 28349
##  5       1 已經    288 43930
##  6       2 艾米    286 28349
##  7       2 一個    242 28349
##  8       1 知道    215 43930
##  9       2 已經    204 28349
## 10       1 彷彿    181 43930
## # ... with 25,579 more rows

每章每個字詞次數與總字數相除並畫長條圖

ggplot(book_words1, aes(n/total, fill = chapter)) +
  geom_histogram(show.legend = FALSE)  +
  facet_wrap(~chapter, ncol = 2, scales = "free_y")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

增加rank及詞性頻率

freq_by_rank <- book_words1 %>% 
  group_by(chapter) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)

freq_by_rank

## # A tibble: 25,589 x 6
## # Groups:   chapter [3]
##    chapter word      n total  rank `term frequency`
##      <int> <chr> <int> <int> <int>            <dbl>
##  1       1 艾米    501 43930     1          0.0114 
##  2       1 一個    494 43930     2          0.0112 
##  3       1 尼克    332 43930     3          0.00756
##  4       2 尼克    293 28349     1          0.0103 
##  5       1 已經    288 43930     4          0.00656
##  6       2 艾米    286 28349     2          0.0101 
##  7       2 一個    242 28349     3          0.00854
##  8       1 知道    215 43930     5          0.00489
##  9       2 已經    204 28349     4          0.00720
## 10       1 彷彿    181 43930     6          0.00412
## # ... with 25,579 more rows

畫出rank及詞性頻率的關係線圖

freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = chapter)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

examine Zipf’s law

rank_subset <- freq_by_rank %>% 
  filter(rank < 500,
         rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)

## 
## Call:
## lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
## 
## Coefficients:
## (Intercept)  log10(rank)  
##     -1.6717      -0.6553

freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = chapter)) + 
  geom_abline(intercept = -0.62, slope = -1.1, color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

bind_tf_idf

每章節前15個重要的字詞，畫視覺圖

book_words1 <- book_words1 %>%
  bind_tf_idf(word, chapter, n)
book_words1

## # A tibble: 25,589 x 7
##    chapter word      n total      tf   idf tf_idf
##      <int> <chr> <int> <int>   <dbl> <dbl>  <dbl>
##  1       1 艾米    501 43930 0.0114      0      0
##  2       1 一個    494 43930 0.0112      0      0
##  3       1 尼克    332 43930 0.00756     0      0
##  4       2 尼克    293 28349 0.0103      0      0
##  5       1 已經    288 43930 0.00656     0      0
##  6       2 艾米    286 28349 0.0101      0      0
##  7       2 一個    242 28349 0.00854     0      0
##  8       1 知道    215 43930 0.00489     0      0
##  9       2 已經    204 28349 0.00720     0      0
## 10       1 彷彿    181 43930 0.00412     0      0
## # ... with 25,579 more rows

book_words1 %>%
  select(-total) %>%
  arrange(desc(tf_idf))

## # A tibble: 25,589 x 6
##    chapter word         n       tf   idf   tf_idf
##      <int> <chr>    <int>    <dbl> <dbl>    <dbl>
##  1       3 返家        11 0.00137  1.10  0.00151 
##  2       2 莎朗        35 0.00123  1.10  0.00136 
##  3       3 短片         9 0.00112  1.10  0.00124 
##  4       2 多蘿西      26 0.000917 1.10  0.00101 
##  5       2 傑夫        69 0.00243  0.405 0.000987
##  6       3 強暴         7 0.000874 1.10  0.000961
##  7       3 證據        17 0.00212  0.405 0.000861
##  8       1 酒吧        93 0.00212  0.405 0.000858
##  9       2 貝爾        18 0.000635 1.10  0.000698
## 10       3 同床共枕     5 0.000625 1.10  0.000686
## # ... with 25,579 more rows

book_words1 %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(chapter) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = chapter)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~chapter, ncol = 2, scales = "free") +
  coord_flip()

## Selecting by tf_idf

0408 tf-idf

group4

2019年4月8日