## Text mining the Dream of the Red Chamber (Hongloumeng)
## Mainly for reading and structuring the text documents
setwd("/Users/daitu/数据分析/订单/Red Dream2")
getwd()
## [1] "/Users/daitu/数据分析/订单/Red Dream2"
## Load the required packages
library(jiebaR)
## Loading required package: jiebaRD
library(tm)
## Loading required package: NLP
library(readr)
library(stringr)
library(plyr)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud2)
library(ggraph)
library(magrittr)  ## provides the %>% pipe used in the plotting code below
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
## Read the required files ####
## Read the stop-word list
mystopwords <- readLines("停用词.txt", encoding = 'UTF-8')
## Read the novel
dream <- readLines("红楼梦01.txt", encoding = 'UTF-8')
## Read the dictionary of character names
man <- readLines("红楼梦人物.txt", encoding = 'UTF-8')
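## A quick look at what was just read (a minimal check; the exact counts
## depend on the files used)
length(mystopwords)  ## number of stop words
length(dream)        ## raw lines of the novel
head(man, 3)         ## first few entries of the name dictionary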
## Split the text into chapters ####
## Remove blank lines
dream <- dream[nchar(dream) != 0]
## Drop the volume ("卷") lines
## Find the first and last line of each chapter
## Chapter titles: lines matching "第...回"
dreamname <- data.frame(name = dream[grep(dream, pattern = "^第+.+回")],
                        chapter = 1:120)
## Split each chapter title "第X回 <title part 1> <title part 2>" on spaces,
## keep the chapter label, and join the two title parts
chapnames <- data.frame(str_split(dreamname$name, pattern = " ", simplify = TRUE))
dreamname$chapter2 <- chapnames$X1
dreamname$Name <- apply(chapnames[, 2:3], 1, str_c, collapse = ",")
## First line of each chapter
dreamname$chapbegin <- grep(dream, pattern = "^第+.+回")
## Last line of each chapter
dreamname$chapend <- c((dreamname$chapbegin - 1)[-1], length(dream))
## Content of each chapter
for (ii in 1:nrow(dreamname)) {
  ## Concatenate the chapter's lines into one string
  chapstrs <- str_c(dream[(dreamname$chapbegin[ii] + 1):dreamname$chapend[ii]],
                    collapse = "")
  ## Remove unnecessary whitespace
  dreamname$content[ii] <- str_replace_all(chapstrs, pattern = "[[:blank:]]",
                                           replacement = "")
}
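## Sanity check on the chapter split: the regex should have matched exactly
## 120 headers and every chapter range should be well-formed
stopifnot(nrow(dreamname) == 120,
          all(dreamname$chapend >= dreamname$chapbegin))
head(dreamname[, c("chapter", "Name", "chapbegin", "chapend")])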
## Keep the chapter contents in a separate vector
content <- dreamname$content
dreamname$content <- NULL
## Segment the text with jiebaR ####
fen <- jiebaR::worker(type = "query", user = "红楼梦词典.txt")
red <- apply_list(as.list(content), fen)
## Remove stop words
red <- lapply(red, filter_segment, filter_words = mystopwords)
red[[1]][1:100]
## [1] "开卷" "第一回" "作者" "自云" "一番" "梦幻"
## [7] "真事" "隐去" "而借" "撰此" "石头记" "一书"
## [13] "故曰" "甄士隐" "但书中" "所记" "何事" "何人"
## [19] "风尘碌碌" "一事无成" "念及" "当日" "女子" "一一"
## [25] "细考" "觉其" "行止" "见识" "之上" "堂堂"
## [31] "须眉" "诚不若" "裙钗" "有余" "无益" "之大"
## [37] "无可如何" "之日也" "自欲" "已往" "所赖" "天恩祖"
## [43] "锦衣" "饫甘餍肥" "之日" "父兄" "教育" "之恩"
## [49] "师友" "规谈" "之德" "今日" "一技无成" "半生"
## [55] "潦倒" "之罪" "编述" "一集" "以告" "天下人"
## [61] "闺阁" "中本" "历历" "有人" "不肖" "自护己"
## [67] "一并" "泯灭" "今日" "茅椽蓬牖" "瓦灶绳床" "晨夕"
## [73] "风露" "阶柳庭花" "未有" "襟怀笔墨" "未学" "下笔"
## [79] "无文" "假语村言" "演出" "一段" "故事" "亦可"
## [85] "闺阁" "昭传" "复可悦" "世之目" "破人" "愁闷"
## [91] "宜乎" "故曰" "贾雨村" "此回" "中凡用" "提醒"
## [97] "阅者" "眼目" "此书" "立意"
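## Tokens per chapter after stop-word removal, as a quick check that the
## segmentation produced sensible, non-empty chapters (ntoken is an
## illustrative helper, not used later)
ntoken <- sapply(red, length)
summary(ntoken)
c(shortest = which.min(ntoken), longest = which.max(ntoken))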
## Word frequency statistics ## -------------------------------------------
## Build the document-term frequency matrix
corp <- Corpus(VectorSource(red))
dtm <- DocumentTermMatrix(corp, control = list(wordLengths = c(1, Inf)))
dtm
## <<DocumentTermMatrix (documents: 120, terms: 41263)>>
## Non-/sparse entries: 119977/4831583
## Sparsity : 98%
## Maximal term length: 12
## Weighting : term frequency (tf)
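## The dtm can be inspected directly; for example, tm's findFreqTerms()
## lists every term whose total count across all 120 chapters reaches
## the given threshold
findFreqTerms(dtm, lowfreq = 1000)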
## Count word frequencies
freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
freq <- data.frame(word = names(freq), freq = freq, row.names = NULL)
freq$word <- as.factor(freq$word)
head(freq)
## word freq
## 1 宝玉 3908
## 2 笑道 1955
## 3 贾母 1686
## 4 一个 1456
## 5 凤姐 1228
## 6 袭人 1154
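## Share of all tokens taken by the most frequent words (the counts shown
## above divided by the total token count)
round(head(freq$freq) / sum(freq$freq), 4)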
## Plot the word frequencies
nn <- 250
sum(freq$freq >= nn)
## [1] 69
freq[freq$freq >= nn, ] %>%
  ggplot(aes(x = word, y = freq)) +
  theme_gray(base_size = 12, base_family = "STKaiti") +
  geom_bar(stat = "identity", fill = "red", colour = "blue", alpha = 1) +
  scale_x_discrete() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Word", y = "freq", title = "Red Dream")

## Word cloud
sum(freq$freq >= 60)
## [1] 393
data.frame(freq[freq$freq >= 60, ]) %>%
  letterCloud("R", wordSize = 12)
## Frequency of the novel's characters ------------------------------------
sum(freq$word %in% man)
## [1] 600
man_freq <- freq[freq$word %in% man, ]
summary(man_freq)
##        word          freq        
##  艾官    :  1   Min.   :   1.00  
##  安国公  :  1   1st Qu.:   1.00  
##  白老媳妇:  1   Median :   3.00  
##  板儿    :  1   Mean   :  42.36  
##  伴鹤    :  1   3rd Qu.:  12.00  
##  包勇    :  1   Max.   :3908.00  
##  (Other) :594                    
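## man_freq is already ordered by frequency, so the protagonists sit at the top
head(man_freq, 10)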
## Bar chart of the character-name frequencies
nn <- 100
sum(man_freq$freq >= nn)
## [1] 42
man_freq[man_freq$freq >= nn, ] %>%
  ggplot(aes(x = reorder(word, freq), y = freq)) +
  theme_gray(base_size = 12, base_family = "STKaiti") +
  geom_bar(stat = "identity", fill = "red", colour = "blue", alpha = 1) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 9)) +
  labs(x = "Word", y = "freq", title = "Red Dream")

## Word cloud of the main characters
data.frame(man_freq[man_freq$freq > 10, ]) %>%
  wordcloud2(color = 'random-dark', backgroundColor = "white",
             shape = 'diamond')
## Cluster analysis
dtm_tfr <- removeSparseTerms(dtm, 0.95)
dtm_tfr
## <<DocumentTermMatrix (documents: 120, terms: 3153)>>
## Non-/sparse entries: 60194/318166
## Sparsity : 84%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Only about 3,000 key terms remain
## Hierarchical clustering of the chapters
dtmmat <- as.matrix(dtm_tfr)
## Measure document similarity with the cosine distance
dtmmat_dist <- proxy::dist(dtmmat, method = "cosine")
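## What proxy::dist(method = "cosine") computes, written out for the first
## two chapters (a minimal sketch: proxy reports 1 minus the cosine
## similarity; cos_dist is an illustrative helper, not used later)
cos_dist <- function(a, b) 1 - sum(a * b) / sqrt(sum(a^2) * sum(b^2))
cos_dist(dtmmat[1, ], dtmmat[2, ])  ## should match as.matrix(dtmmat_dist)[1, 2]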
clust <- hclust(d = dtmmat_dist,method = "ward.D2")
clust
##
## Call:
## hclust(d = dtmmat_dist, method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : cosine
## Number of objects: 120
##
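## A flat two-cluster cut of the tree, cross-tabulated against the
## first-80 / last-40 split (a rough consistency check, not a formal test;
## the group labels are rebuilt inline since dreamname$Group is only
## defined below)
table(cutree(clust, k = 2), rep(c("F80", "H40"), times = c(80, 40)))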
Den <- as.dendrogram(clust)
## Attach each leaf's chapter group (first 80 vs. last 40) as node data
dreamname$Group <- factor(rep(c(1, 2), times = c(80, 40)),
                          labels = c("F80", "H40"))
Den <- dendrapply(Den, function(d) {
  if (is.leaf(d))
    attr(d, 'nodePar') <- list(Group = dreamname$Group[as.integer(attr(d, 'label'))])
  d
})
# Plotting this looks very much like ggplot2 except for the new geoms
ggraph(graph = Den, layout = 'dendrogram', repel = TRUE, circular = TRUE,
       ratio = 0.5) +
  geom_edge_elbow() +
  geom_node_text(aes(x = x * 1.05, y = y * 1.05, filter = leaf,
                     angle = nAngle(x, y), label = label),
                 size = 3, hjust = 'outward') +
  geom_node_point(aes(filter = leaf, color = Group)) +
  coord_fixed() +
  ggforce::theme_no_axes()

## Do the first 80 and the last 40 chapters come from the same author?
## Try to classify chapters into the two halves by word usage
dtm_tfr <- removeSparseTerms(dtm, 0.9)
dtm_tfr
## <<DocumentTermMatrix (documents: 120, terms: 1674)>>
## Non-/sparse entries: 47645/153235
## Sparsity : 76%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Create a container holding the training and test splits
## (virgin = FALSE tells RTextTools the test documents have known labels)
trainsize <- c(1:60, 80:100)
testsize <- c(61:79, 101:120)
containersvm <- create_container(dtm_tfr, labels = dreamname$Group,
                                 trainSize = trainsize,
                                 testSize = testsize,
                                 virgin = FALSE)
## Train a GLMNET model
system.time({
  GLMNET <- train_model(containersvm, "GLMNET")
})
## user system elapsed
## 0.245 0.003 0.254
## Classify the held-out chapters with the trained model
system.time({
  GLMNET_class <- classify_model(containersvm, GLMNET)
})
## user system elapsed
## 0.096 0.001 0.098
## Evaluate on the held-out test chapters
table(dreamname$Group[testsize], GLMNET_class$GLMNET_LABEL)
##      
##       F80 H40
##   F80  18   1
##   H40   4  16
## Most of the first-80 test chapters are assigned to the F80 group and most
## of the last-40 test chapters to the H40 group, so the two parts of the
## novel are distinguishable by their word usage.
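## Overall accuracy on the held-out chapters, from the table above:
## (18 + 16) / 39, i.e. roughly 87% of the test chapters land in their
## own half
mean(as.character(dreamname$Group[testsize]) ==
       as.character(GLMNET_class$GLMNET_LABEL))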