## Data preprocessing for text mining of Hongloumeng (Dream of the Red Chamber) ####
## Mainly reads and structures the text documents
## for analyzing and mining the character relationships in R
## Sun Yulin; October 31, 2016
## If two characters appear together in the same paragraph, their co-occurrence weight is incremented by 1
## Load the required packages
library(jiebaR)
## Loading required package: jiebaRD
library(tm)
## Loading required package: NLP
library(readr)
library(stringr)
library(plyr)
## Read the required files ####
## Read the stop-word list
filename <- "./数据/我的红楼梦停用词.txt"
mystopwords <- readLines(filename)
## Read the text of the novel
filename <- "./数据/红楼梦UTF82.txt"
Red_dream <- readLines(filename, encoding = 'UTF-8')
## Warning in readLines(filename, encoding = "UTF-8"): incomplete final line
## found on './数据/红楼梦UTF82.txt'
## Read the character-name dictionary
filename <- "./数据/红楼梦人物.txt"
Red_man <- readLines(filename, encoding = 'UTF-8')
filename <-"./数据/红楼梦诗词123.txt"
dictionary <- readLines(filename,encoding='UTF-8')
sum(is.element(Red_man,dictionary))
## [1] 699
Red_man <- Red_man[is.element(Red_man,dictionary)]
## Split the text into chapters ####
## Remove missing lines
Red_dream <- Red_dream[!is.na(Red_dream)]
## Remove the volume ("卷") heading lines
juan <- grep(Red_dream, pattern = "^第+.+卷")
Red_dream <- Red_dream[-juan]
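## Sanity check (added; not in the original script): after removing the
## volume lines, the text should contain 120 chapter headings.
length(grep(Red_dream, pattern = "^第+.+回"))  # should be 120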
## Find the first and last line of each chapter
## Chapter titles
Red_dreamname <- data.frame(name = Red_dream[grep(Red_dream, pattern = "^第+.+回")],
                            chapter = 1:120)
## Process the chapter titles
names <- data.frame(str_split(Red_dreamname$name, pattern = " ", simplify = TRUE))
Red_dreamname$chapter2 <- names$X1
Red_dreamname$Name <- apply(names[, 2:3], 1, str_c, collapse = ",")
## First line of each chapter
Red_dreamname$chapbegin <- grep(Red_dream, pattern = "^第+.+回")
## Last line of each chapter
Red_dreamname$chapend <- c((Red_dreamname$chapbegin - 1)[-1], length(Red_dream))
## Number of paragraphs in each chapter
Red_dreamname$chaplen <- Red_dreamname$chapend - Red_dreamname$chapbegin
## Content of each chapter
for (ii in 1:nrow(Red_dreamname)) {
  ## Concatenate the chapter's paragraphs into a single string
  chapstrs <- str_c(Red_dream[(Red_dreamname$chapbegin[ii] + 1):Red_dreamname$chapend[ii]],
                    collapse = "")
  ## Strip unnecessary whitespace
  Red_dreamname$content[ii] <- str_replace_all(chapstrs, pattern = "[[:blank:]]",
                                               replacement = "")
}
## Keep the chapter contents in a separate vector
content <- Red_dreamname$content
Red_dreamname$content <- NULL
## Count the number of characters in each chapter
Red_dreamname$numchars <- nchar(content)
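## Optional inspection (added): a quick look at how chapter length varies.
plot(Red_dreamname$numchars, type = "h",
     xlab = "chapter", ylab = "number of characters")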
## Chapter-level weights: count co-occurrences within the same chapter
## Segment the text of the novel ####
Red_fen <- jiebaR::worker(type = "mix",user = "./数据/红楼梦词典.txt")
Fen_red <- apply_list(as.list(content),Red_fen)
## Remove stop words, using parallel processing
library(parallel)
cl <- makeCluster(4)
Fen_red <- parLapply(cl = cl, Fen_red, filter_segment, filter_words = mystopwords)
stopCluster(cl)
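## For reference (added; not in the original script): the equivalent
## sequential call, useful when a cluster is not available.
# Fen_red <- lapply(Fen_red, filter_segment, filter_words = mystopwords)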
## Word frequency statistics ##------------------------------------------------
## 1: Build the document-term frequency matrix
corpus <- Corpus(VectorSource(Fen_red))
Red_dtm <- DocumentTermMatrix(corpus,control = list(wordLengths=c(1,Inf)))
Red_dtm
## <<DocumentTermMatrix (documents: 120, terms: 40342)>>
## Non-/sparse entries: 117554/4723486
## Sparsity : 98%
## Maximal term length: 12
## Weighting : term frequency (tf)
## More than 40,000 terms in total
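## Optional inspection (added): list the terms occurring at least 500 times.
findFreqTerms(Red_dtm, lowfreq = 500)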
## 2: Word frequency statistics
word_freq <- sort(colSums(as.matrix(Red_dtm)),decreasing = TRUE)
word_freq <- data.frame(word = names(word_freq),freq=word_freq,row.names = NULL)
## Keep only the rows corresponding to character names
word_freq <- word_freq[is.element(word_freq$word, Red_man), ]
summary(word_freq)
## word freq
## 艾官 : 1 Min. : 1.00
## 安国公 : 1 1st Qu.: 1.00
## 白老媳妇: 1 Median : 3.00
## 板儿 : 1 Mean : 45.47
## 伴鹤 : 1 3rd Qu.: 13.00
## 包勇 : 1 Max. :3907.00
## (Other) :530
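## Optional inspection (added): word_freq is sorted by decreasing frequency,
## so the head shows the most frequently mentioned characters.
head(word_freq, 10)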
hist(word_freq$freq,breaks= 200)

sum(word_freq$freq >10)
## [1] 149
## Analyze only the characters that appear more than 10 times
Red_man <- as.character(word_freq$word[word_freq$freq >10])
Red_man
## [1] "宝玉" "贾母" "凤姐" "袭人" "王夫人"
## [6] "宝钗" "贾政" "贾琏" "平儿" "薛姨妈"
## [11] "探春" "紫鹃" "鸳鸯" "贾珍" "李纨"
## [16] "尤氏" "晴雯" "刘姥姥" "邢夫人" "薛蟠"
## [21] "林黛玉" "香菱" "麝月" "贾蓉" "贾赦"
## [26] "惜春" "贾芸" "周瑞家的" "芳官" "贾环"
## [31] "妙玉" "雪雁" "迎春" "赵姨娘" "莺儿"
## [36] "宝蟾" "秦钟" "巧姐" "薛蝌" "秋纹"
## [41] "贾兰" "尤二姐" "茗烟" "史湘云" "大了"
## [46] "林之孝家的" "司棋" "五儿" "赖大" "贾瑞"
## [51] "贾蔷" "凤丫头" "林之孝" "兴儿" "焙茗"
## [56] "冯紫英" "琥珀" "包勇" "彩云" "旺儿"
## [61] "翠缕" "门子" "周瑞" "丰儿" "玉钏儿"
## [66] "李贵" "柳家的" "金钏儿" "板儿" "李嬷嬷"
## [71] "倪二" "金荣" "王仁" "甄宝玉" "坠儿"
## [76] "鲍二" "北静王" "秋桐" "藕官" "尤三姐"
## [81] "春燕" "贾芹" "琏二奶奶" "彩屏" "李十儿"
## [86] "侍书" "李纹" "玻璃" "王善保家的" "薛宝钗"
## [91] "翠墨" "张道士" "彩霞" "马道婆" "王太医"
## [96] "王子腾" "柳湘莲" "蕊官" "邢岫烟" "龄官"
## [101] "贾敬" "李绮" "佩凤" "入画" "狗儿"
## [106] "贾雨村" "文官" "邢大舅" "绣桔" "云儿"
## [111] "詹光" "大姐儿" "李宫裁" "颦儿" "贾宝玉"
## [116] "焦大" "水溶" "王一贴" "尤老娘" "元春"
## [121] "智能" "彩明" "花自芳" "青儿" "素云"
## [126] "英莲" "碧痕" "李婶娘" "鲍二家的" "封肃"
## [131] "女先儿" "琪官" "鹦哥" "赵嬷嬷" "赵堂官"
## [136] "贾菌" "空空道人" "赖大家的" "赖尚荣" "王家的"
## [141] "旺儿媳妇" "吴新登" "俞禄" "锄药" "金钏"
## [146] "赖嬷嬷" "秋菱" "小幺儿" "周姨娘"
## Generate all pairwise combinations of the characters
Red_mansol <- t(combn(Red_man,2,simplify = FALSE))
Red_mansol <- plyr::ldply(Red_mansol)
names(Red_mansol) <- c("First","Second")
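## Sanity check (added): 149 names should yield choose(149, 2) = 11026 pairs.
nrow(Red_mansol)  # should be 11026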
## Count, for each pair, the number of documents in which both names occur
## Helper function: given a pair of names and a list of tokenized documents,
## count the documents that contain both names
timesFre <- function(strss, fencisol) {
  strs <- as.character(strss)
  ## For each document, flag which of the two names it contains
  aa <- lapply(fencisol, is.element, el = strs)
  aa <- lapply(aa, sum)
  ## A document counts only when both names are present (sum == 2)
  aa <- ifelse(aa == 2, 1, 0)
  return(sum(aa))
}
timesFre(Red_mansol[1,],fencisol = Fen_red)
## [1] 98
system.time({
  weights <- apply(Red_mansol[1:100, ], 1, timesFre, fencisol = Fen_red)
})
## user system elapsed
## 6.550 0.036 6.601
system.time({
  weights <- apply(Red_mansol, 1, timesFre, fencisol = Fen_red)
})
## user system elapsed
## 663.097 1.786 665.310
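## The apply() above tests every pair against every chapter, so the full run
## takes about 11 minutes (the 100-pair trial at ~6.6 s, scaled to 11026
## pairs). A much faster vectorized alternative (a sketch added here, not
## part of the original script): build a binary chapter-by-name incidence
## matrix and obtain all pair counts at once via a cross-product.
inci <- sapply(Red_man, function(man) {
  vapply(Fen_red, function(tokens) as.integer(man %in% tokens), integer(1))
})
cooc <- crossprod(inci)  # name-by-name co-occurrence counts over chapters
weights_fast <- cooc[cbind(match(Red_mansol$First, colnames(cooc)),
                           match(Red_mansol$Second, colnames(cooc)))]
# all.equal(as.numeric(weights_fast), as.numeric(weights))  # should be TRUE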
hist(weights)

# ## Alternative (kept for reference): count pair co-occurrences with an
# ## explicit double loop over pairs and documents
# weights <- vector(mode = "numeric", length = nrow(Red_mansol))
# for (ii in 1:nrow(Red_mansol)) {
#   strs <- as.character(Red_mansol[ii, 1:2])
#   weight <- 0
#   for (kk in 1:length(Fen_red)) {
#     weight <- ifelse(sum(is.element(strs, Fen_red[[kk]])) == 2, weight + 1, weight)
#   }
#   weights[ii] <- weight
# }
# summary(weights)
# table(weights)
Red_mansol$chapweight <- weights
## Frequency weights based on co-occurrence within the same paragraph
## Segment the novel again, this time paragraph by paragraph ####
Red_fen <- jiebaR::worker(type = "mix", user = "./数据/红楼梦词典.txt")
Fen_red <- apply_list(as.list(Red_dream), Red_fen)
## Remove stop words, using parallel processing
library(parallel)
cl <- makeCluster(4)
Fen_red <- parLapply(cl = cl, Fen_red, filter_segment, filter_words = mystopwords)
stopCluster(cl)
## Compute the paragraph-level weights
weights <- apply(Red_mansol,1,timesFre,fencisol = Fen_red)
hist(weights)

Red_mansol$duanweight <- weights
write_csv(Red_mansol,"./数据/社交网络权重.csv")
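## Downstream usage (a sketch, assuming the igraph package; not part of the
## original script): the exported weights can be loaded as an undirected
## character network, with the extra columns becoming edge attributes.
# library(igraph)
# edges <- read_csv("./数据/社交网络权重.csv")
# edges <- edges[edges$chapweight > 0, ]  # drop pairs that never co-occur
# g <- graph_from_data_frame(edges, directed = FALSE)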