## Text mining of 红楼梦 (Dream of the Red Chamber)
## Mainly reading and structuring the text documents


setwd("/Users/daitu/数据分析/订单/Red Dream2")

getwd()
## [1] "/Users/daitu/数据分析/订单/Red Dream2"
## Load the required packages
library(jiebaR)
## Loading required package: jiebaRD
library(tm)
## Loading required package: NLP
library(readr)
library(stringr)
library(plyr)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud2)
## magrittr supplies the %>% pipe used in the steps below
library(magrittr)
library(ggraph)
library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## Read the required files ####
## Read the stopword list
mystopwords <- readLines("停用词.txt",encoding='UTF-8')
## Read the novel text
dream <- readLines("红楼梦01.txt",encoding='UTF-8')
## Read the character-name dictionary
man <- readLines("红楼梦人物.txt",encoding='UTF-8')

## Split the document into chapters ####
## Remove blank lines
dream <- dream[!nchar(dream) == 0]
## Remove the volume ("卷") data
## Find the first and last line of each chapter
## Chapter titles
dreamname <- data.frame(name = dream[grep(dream,pattern = "^第.+回")],
                            chapter = 1:120)
## Parse the chapter titles
name_df <- data.frame(str_split(dreamname$name,pattern = " ",simplify =TRUE))
dreamname$chapter2 <- name_df$X1
dreamname$Name <- apply(name_df[,2:3],1,str_c,collapse = ",")
## Starting line of each chapter
dreamname$chapbegin<- grep(dream,pattern = "^第.+回")
## Ending line of each chapter
dreamname$chapend <- c((dreamname$chapbegin-1)[-1],length(dream))

## Content of each chapter
for (ii in 1:nrow(dreamname)) {
  ## Concatenate the chapter's lines into one string
  chapstrs <- str_c(dream[(dreamname$chapbegin[ii]+1):dreamname$chapend[ii]],collapse = "")
  ## Strip unnecessary whitespace
  dreamname$content[ii] <- str_replace_all(chapstrs,pattern = "[[:blank:]]",replacement = "")
}
## Keep the chapter contents as a separate vector
content <- dreamname$content
dreamname$content <- NULL
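## Optional sanity check (a minimal sketch): the chapter regex should have
## matched exactly 120 chapters, and none of them should be empty
stopifnot(length(content) == 120, all(nchar(content) > 0))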
## Tokenize the novel ####
fen <- jiebaR::worker(type = "query",user = "红楼梦词典.txt")
red <- apply_list(as.list(content),fen)
## Remove stopwords
red <- lapply(red, filter_segment,filter_words=mystopwords)
red[[1]][1:100]
##   [1] "开卷"     "第一回"   "作者"     "自云"     "一番"     "梦幻"    
##   [7] "真事"     "隐去"     "而借"     "撰此"     "石头记"   "一书"    
##  [13] "故曰"     "甄士隐"   "但书中"   "所记"     "何事"     "何人"    
##  [19] "风尘碌碌" "一事无成" "念及"     "当日"     "女子"     "一一"    
##  [25] "细考"     "觉其"     "行止"     "见识"     "之上"     "堂堂"    
##  [31] "须眉"     "诚不若"   "裙钗"     "有余"     "无益"     "之大"    
##  [37] "无可如何" "之日也"   "自欲"     "已往"     "所赖"     "天恩祖"  
##  [43] "锦衣"     "饫甘餍肥" "之日"     "父兄"     "教育"     "之恩"    
##  [49] "师友"     "规谈"     "之德"     "今日"     "一技无成" "半生"    
##  [55] "潦倒"     "之罪"     "编述"     "一集"     "以告"     "天下人"  
##  [61] "闺阁"     "中本"     "历历"     "有人"     "不肖"     "自护己"  
##  [67] "一并"     "泯灭"     "今日"     "茅椽蓬牖" "瓦灶绳床" "晨夕"    
##  [73] "风露"     "阶柳庭花" "未有"     "襟怀笔墨" "未学"     "下笔"    
##  [79] "无文"     "假语村言" "演出"     "一段"     "故事"     "亦可"    
##  [85] "闺阁"     "昭传"     "复可悦"   "世之目"   "破人"     "愁闷"    
##  [91] "宜乎"     "故曰"     "贾雨村"   "此回"     "中凡用"   "提醒"    
##  [97] "阅者"     "眼目"     "此书"     "立意"
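## For illustration, the "query"-type worker can also segment a single
## sentence (illustrative input; the tokens depend on the user dictionary):
segment("宝玉和黛玉在大观园里说话", fen)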
## Word-frequency statistics ##-----------------------------------------------------------
## Build the document-term frequency matrix
corp <- Corpus(VectorSource(red))
dtm <- DocumentTermMatrix(corp,control = list(wordLengths=c(1,Inf)))
dtm
## <<DocumentTermMatrix (documents: 120, terms: 41263)>>
## Non-/sparse entries: 119977/4831583
## Sparsity           : 98%
## Maximal term length: 12
## Weighting          : term frequency (tf)
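## To peek at a small corner of the matrix, tm's inspect() accepts slices:
inspect(dtm[1:3, 1:5])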
## Count term frequencies
freq <- sort(colSums(as.matrix(dtm)),decreasing = TRUE)
freq <- data.frame(word = names(freq),freq=freq,row.names = NULL)
freq$word <- as.factor(freq$word)
head(freq)
##   word freq
## 1 宝玉 3908
## 2 笑道 1955
## 3 贾母 1686
## 4 一个 1456
## 5 凤姐 1228
## 6 袭人 1154
## Plot the word frequencies
nn <- 250
sum(freq$freq>=nn)
## [1] 69
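## The same count via tm's built-in helper (this should also return 69):
length(findFreqTerms(dtm, lowfreq = nn))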
freq[freq$freq >= nn,] %>%
  ggplot(aes(x = word,y = freq)) +
  theme_gray(base_size = 12,base_family = "STKaiti") +
  geom_bar(stat = "identity",fill= "red",colour = "blue",alpha = 1) +
  scale_x_discrete() +
  theme(axis.text.x = element_text(angle = 90,hjust = 1)) +
  labs(x = "Word",y = "freq",title = "Red Dream")

## Word cloud
sum(freq$freq>=60)
## [1] 393
data.frame(freq[freq$freq>=60,]) %>%
  letterCloud("R",wordSize = 12)
## Frequency of character names in the novel ----------------------------------
sum(freq$word %in% man)
## [1] 600
man_freq <- freq[freq$word %in% man,]
summary(man_freq)
##        word          freq        
##  艾官    :  1   Min.   :   1.00  
##  安国公  :  1   1st Qu.:   1.00  
##  白老媳妇:  1   Median :   3.00  
##  板儿    :  1   Mean   :  42.36  
##  伴鹤    :  1   3rd Qu.:  12.00  
##  包勇    :  1   Max.   :3908.00  
##  (Other) :594
## Character-name frequencies and word cloud
nn <- 100
sum(man_freq$freq>=nn)
## [1] 42
man_freq[man_freq$freq >= nn,] %>%
  ggplot(aes(x = reorder(word,freq),y = freq)) +
  theme_gray(base_size = 12,base_family = "STKaiti") +
  geom_bar(stat = "identity",fill= "red",colour = "blue",alpha = 1) +
  theme(axis.text.x = element_text(angle = 90,hjust = 1,size = 9)) +
  labs(x = "Word",y = "freq",title = "Red Dream")

## Word cloud of the main characters
data.frame(man_freq[man_freq$freq>10,]) %>%
  wordcloud2(color = 'random-dark',backgroundColor = "white",
             shape =  'diamond' )
## Cluster analysis
dtm_tfr <- removeSparseTerms(dtm,0.95)
dtm_tfr
## <<DocumentTermMatrix (documents: 120, terms: 3153)>>
## Non-/sparse entries: 60194/318166
## Sparsity           : 84%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## Only about 3,150 key terms remain
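## removeSparseTerms(dtm,0.95) drops terms absent from at least 95% of the
## 120 chapters, i.e. it keeps terms occurring in more than about 6 chapters.
## A manual cross-check (the boundary may differ by one chapter depending on
## tm's exact definition):
doc_count <- colSums(as.matrix(dtm) > 0)
sum(doc_count > (1 - 0.95) * nDocs(dtm))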

## Hierarchical clustering of the chapters
dtmmat <- as.matrix(dtm_tfr)

## Cosine distance between the chapter term vectors
dtmmat_dist <- proxy::dist(dtmmat,method ="cosine")
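## The cosine distance is 1 minus the cosine similarity of two chapters'
## term-count vectors; a manual check for chapters 1 and 2, which should
## match proxy's result up to rounding (proxy converts the similarity to a
## distance as 1 - similarity):
x <- dtmmat[1,]; y <- dtmmat[2,]
1 - sum(x*y)/(sqrt(sum(x^2))*sqrt(sum(y^2)))
as.matrix(dtmmat_dist)[1,2]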

clust <- hclust(d = dtmmat_dist,method = "ward.D2")
clust
## 
## Call:
## hclust(d = dtmmat_dist, method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : cosine 
## Number of objects: 120
## 
Den <- as.dendrogram(clust)
## Attach the chapter group to each leaf node
dreamname$Group <- factor(rep(c(1,2),times = c(80,40)),
                              labels = c("F80","H40"))
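## A quick cross-check (sketch): how does a two-cluster cut of the tree line
## up with the 80/40 split? (The cluster labels 1/2 are arbitrary.)
table(cutree(clust, k = 2), dreamname$Group)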

Den <- dendrapply(Den, function(d) {
  if(is.leaf(d)) 
    attr(d, 'nodePar') <- list(Group=dreamname[as.integer(attr(d, 'label')),"Group"])
  d
})

## Plotting this looks very much like ggplot2, except for the new geoms.
## Note: this uses an early ggraph API; in ggraph >= 1.0, nAngle() was
## renamed to node_angle().
ggraph(graph = Den, layout = 'dendrogram', repel = TRUE, circular = TRUE, 
       ratio = 0.5) + 
  geom_edge_elbow() + 
  geom_node_text(aes(x = x*1.05, y=y*1.05, filter=leaf, 
                     angle = nAngle(x, y), label = label), 
                 size=3, hjust='outward') + 
  geom_node_point(aes(filter=leaf, color=Group)) + 
  coord_fixed() + 
  ggforce::theme_no_axes() 

## Check whether the first 80 and last 40 chapters share an author
dtm_tfr <- removeSparseTerms(dtm,0.9)
dtm_tfr
## <<DocumentTermMatrix (documents: 120, terms: 1674)>>
## Non-/sparse entries: 47645/153235
## Sparsity           : 76%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## Create a container for training and testing (the model fitted below is GLMNET)
trainsize <- c(1:60,80:100)
testsize <- c(61:79,101:120)
containersvm <- create_container(dtm_tfr,labels = dreamname$Group,
                                 trainSize = trainsize,
                                 testSize = testsize,
                                 virgin=FALSE)

## Train the model
system.time({
  GLMNET <- train_model(containersvm,"GLMNET")
})
##    user  system elapsed 
##   0.245   0.003   0.254
## Classifying data using trained models
system.time({
  GLMNET_class <- classify_model(containersvm,GLMNET)
})
##    user  system elapsed 
##   0.096   0.001   0.098
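## RTextTools can also summarise performance directly; a sketch assuming the
## CRAN version's create_analytics() interface:
analytics <- create_analytics(containersvm, GLMNET_class)
summary(analytics)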
## Evaluate the test-set predictions

table(dreamname$Group[testsize],GLMNET_class$GLMNET_LABEL)
##      
##       F80 H40
##   F80  18   1
##   H40   4  16
## Most of the first 80 chapters are classified into the F80 group and most of
## the last 40 into the H40 group, consistent with the two parts being written
## by different authors
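## The overall accuracy implied by the confusion matrix: (18 + 16)/39, about 0.87
conf_mat <- table(dreamname$Group[testsize],GLMNET_class$GLMNET_LABEL)
sum(diag(conf_mat))/sum(conf_mat)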