本小說為描述主角尼克與女主角艾咪,這對夫妻由情人到婚後生活,因相繼失業問題導致婚姻自此急轉直下,後續因主角尼克對婚姻不忠,導致女主角愛咪精心策劃自導自演一場謀殺和懸案。 第一章:主要敘述了尼克和艾咪的婚姻。 第二章:揭露事件真相,尼克對艾咪不忠搞外遇;艾咪則因身無分文投靠前男友戴西家中。尼克則在律師建議下上電視訪談節目,講述對艾咪感情並陳述殺害妻子。愛咪看到節目後,也誤以為尼克真心願意挽回自己。她誘惑戴西和自己發生性關係,趁機將他殺害,並布置成遭他強姦被迫自衛的假象。 第三章:愛咪逃回家裡,編造故事稱自己遭到戴西綁架,被囚禁成為性奴隸。警方和公眾相信了她的說辭,但尼克知道她在撒謊。但由於社會語論壓力下,尼克最終屈服,兩人得以繼續這段婚姻生活。
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(jiebaR)
## Loading required package: jiebaRD
library(gutenbergr)
library(stringr)
library(wordcloud2)
library(wordcloud)
## Loading required package: RColorBrewer
library(ggplot2)
library(tidyr)
library(scales)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
## The following object is masked from 'package:tidyr':
##
## smiths
setwd("C:/learning/hw2")
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業
## 系統回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
orginalDF = read.table("消失的愛人(控制).txt",header= F , sep="\n",fileEncoding = "UTF-8", stringsAsFactors=F)
head(orginalDF)
## V1
## 1 消失的愛人
## 2 [美]吉莉安‧弗琳 著
## 3 胡緋 譯
## 4 中信出版社
## 5 圖書在版編目( CIP)數據
## 6 消失的愛人 /(美)弗琳著;胡緋譯 . —北京:中信出版社, 2013.6
novelDf = data_frame(text =orginalDF[-(1:95),])
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
head(novelDf)
## # A tibble: 6 x 1
## text
## <chr>
## 1 第一部分
## 2 芳蹤難覓
## 3 尼克‧鄧恩
## 4 事發當日
## 5 每當想起我太太,我總會想起她那顆頭顱。最先想起的是輪廓:第一眼見到她時,我望見的就是她的後腦,那頭顱有著某種曼妙之處,好似一粒閃亮堅硬的玉米,要~
## 6 不管在哪兒,我都不會錯認她那顆小腦袋。
novelVector = unlist(strsplit(novelDf$text,"[,。?]"), use.names=FALSE)
head(novelVector)
## [1] "第一部分" "芳蹤難覓" "尼克‧鄧恩"
## [4] "事發當日" "每當想起我太太" "我總會想起她那顆頭顱"
novelDf = data_frame(text=novelVector) %>%
mutate(line=c(1:nrow(.))) %>%
mutate(chapter = cumsum(str_detect(.$text, regex("^第.*部分$"))))
head(novelDf)
## # A tibble: 6 x 3
## text line chapter
## <chr> <int> <int>
## 1 第一部分 1 1
## 2 芳蹤難覓 2 1
## 3 尼克‧鄧恩 3 1
## 4 事發當日 4 1
## 5 每當想起我太太 5 1
## 6 我總會想起她那顆頭顱 6 1
#分析角色
role_tokenizer <- function(source,textdata) {
DiaryNick = data.frame(text=c(""),name=c("OTHER"),chapter="",line=1,stringsAsFactors = F)
DiaryAmy = data.frame(text=c(""),name=c("OTHER"),chapter="",line=1,stringsAsFactors = F)
isNick=FALSE
isAmy=FALSE
count=0
lapply(textdata, function(x) {
count <<- count+1
chapterv = source$chapter[count]
linev = source$line[count]
if (x != "") {
if (str_detect(x, regex("^(尼克‧鄧恩)$"))){
isNick <<- TRUE
isAmy <<- FALSE
}
if (str_detect(x, regex("^(艾米‧艾略特)$"))
| str_detect(x, regex("^(艾米‧艾略特‧鄧恩)$")) ){
isNick <<- FALSE
isAmy <<- TRUE
}
if (isNick==TRUE){
DiaryNick <<- rbind(DiaryNick,data.frame(text=c(x),name=c("NICK"),chapter=chapterv,line =linev,stringsAsFactors = F))
}
if (isAmy==TRUE){
DiaryAmy <<- rbind(DiaryAmy,data.frame(text=c(x),name=c("AMY"),chapter=chapterv,line=linev,stringsAsFactors = F))
}
}
})
DiaryNick <- DiaryNick %>% filter(text !="")
DiaryAmy <- DiaryAmy %>% filter(text !="")
return(list(NICK=DiaryNick ,AMY=DiaryAmy))
}
NickAndAmy <- novelDf %>% filter(text != "") %>% role_tokenizer(.$text)
setwd("C:/learning/hw2")
jieba_tokenizer <- worker(stop_word = "stop_words.txt",user="user_words.txt")
#save(DiaryAll,file="AmyAndNick.RData")
#定義丟給unnest_tokens的分詞
book_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
# 將詞彙長度為1的詞清除
tokens <- tokens[nchar(tokens)>1]
return(tokens)
})
}
tidybook = novelDf %>% unnest_tokens(word,text,token= book_tokenizer)
head(tidybook)
## # A tibble: 6 x 3
## line chapter word
## <int> <int> <chr>
## 1 1 1 第一
## 2 1 1 部分
## 3 2 1 芳蹤
## 4 2 1 難覓
## 5 3 1 尼克‧鄧恩
## 6 4 1 事發
tidybook %>%
count(word, sort = TRUE) %>%
filter(n > 200) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
ylab("出現次數") +
coord_flip()
tokens_count <- tidybook %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
head(tokens_count)
## # A tibble: 6 x 2
## word sum
## <chr> <int>
## 1 艾米 942
## 2 一個 826
## 3 尼克 720
## 4 已經 567
## 5 知道 432
## 6 覺得 295
tokens_count %>% wordcloud2()
#準備LIWC字典
setwd("C:/learning/hw2")
liwc_p <- read_file("C:/learning/hw2/dict/liwc/positive.txt")
liwc_n <- read_file("C:/learning/hw2/dict/liwc/negative.txt")
#切出LIWC情緒的正負評價,製作對照表
positive <- strsplit(liwc_p, "[,]")[[1]]
negative <- strsplit(liwc_n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiment = "positive",stringsAsFactors = F)
negative <- data.frame(word = negative, sentiment = "negative",stringsAsFactors = F)
LIWC_ch = rbind(positive,negative)
head(LIWC_ch)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
result_sentiment = tokens_count %>%
select(word) %>%
inner_join(LIWC_ch)
## Joining, by = "word"
head(result_sentiment)
## # A tibble: 6 x 2
## word sentiment
## <chr> <chr>
## 1 希望 positive
## 2 喜歡 positive
## 3 問題 negative
## 4 朋友 positive
## 5 擔心 negative
## 6 相信 positive
tokens_count %>%
inner_join(LIWC_ch) %>%
select(word,sentiment,sum) %>%
acast(word ~ sentiment,value.var = "sum", fill = 0) %>%
wordcloud::comparison.cloud(random.order=FALSE,colors = c("indianred3", "gray80"),max.words = 108)
## Joining, by = "word"
calsentiment <-tidybook %>%
inner_join(LIWC_ch) %>%
count(chapter = chapter, sentiment)%>%
spread(sentiment, n, fill = 0) %>%
mutate(sentimentx = positive - negative)
## Joining, by = "word"
head(calsentiment)
## # A tibble: 3 x 4
## chapter negative positive sentimentx
## <int> <dbl> <dbl> <dbl>
## 1 1 1179 1586 407
## 2 2 838 1046 208
## 3 3 284 298 14
ggplot(calsentiment, aes(chapter, sentimentx,fill = chapter)) +
geom_col(show.legend = FALSE,width = 0.8) +
xlab("章節")+
ylab("情緒差值")
calsentiment$negative = calsentiment$negative * -1
calsentiment <-calsentiment %>%
gather(key=sentiment , cnt,negative:positive)
head(calsentiment)
## # A tibble: 6 x 4
## chapter sentimentx sentiment cnt
## <int> <dbl> <chr> <dbl>
## 1 1 407 negative -1179
## 2 2 208 negative -838
## 3 3 14 negative -284
## 4 1 407 positive 1586
## 5 2 208 positive 1046
## 6 3 14 positive 298
ggplot(calsentiment, aes(chapter, cnt,fill = sentiment)) +
geom_col(show.legend = FALSE,width = 0.8) +
scale_y_continuous(breaks=seq(-2000, 2000, 200)) +
scale_x_continuous(breaks=seq(0, 3, 1)) +
xlab("章節")+
ylab("情緒值")
NickData = data.frame(NickAndAmy["NICK"])
colnames(NickData) = c("text","name","chapter","line")
AmyData = data.frame(NickAndAmy["AMY"])
colnames(AmyData) = c("text","name","chapter","line")
AmyNickData = rbind(NickData,AmyData)
AmyNickDataTidy = AmyNickData %>% unnest_tokens(word,text,token= book_tokenizer)
Allsentiment <- AmyNickDataTidy %>%
inner_join(LIWC_ch) %>%
count(name,chapter,index = line %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
head(Allsentiment)
## # A tibble: 6 x 6
## name chapter index negative positive sentiment
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 AMY 1 5 0 7 7
## 2 AMY 1 6 3 16 13
## 3 AMY 1 7 6 8 2
## 4 AMY 1 8 5 10 5
## 5 AMY 1 9 0 5 5
## 6 AMY 1 17 1 1 0
ggplot(Allsentiment, aes(index, sentiment, fill = chapter)) +
geom_col(show.legend = TRUE) +
ylab("情緒值")+
xlab("行數")+
guides(fill=guide_legend(title="章節"))+
facet_wrap(~name, nrow = 2, scales = "free_y")
temp =tidybook %>%
inner_join(LIWC_ch) %>%
count(chapter = chapter, sentiment)
## Joining, by = "word"
ggplot(temp,aes(x=chapter, y=n,group=sentiment, colour=sentiment)) +
geom_line()
word_count <- tidybook %>%
inner_join(LIWC_ch) %>%
mutate(sentimentValue = case_when(sentiment == 'negative' ~ 1,
sentiment =='positive' ~ -1))
## Joining, by = "word"
word_count %>%
group_by(chapter) %>%
summarise(avg_sentiment = mean(sentimentValue)) %>%
ggplot(aes(x=chapter,y=avg_sentiment)) +
geom_line()