## sougou用户画像数据分析与数据挖掘####
## 对分词后的结果进行处理,数据探索性分析
##设置工作文件夹
setwd("/Users/Daitu/数据分析/CCF/Sogou画像")
getwd()
## [1] "/Users/daitu/数据分析/CCF/Sogou画像"
## 加载所需要的包
library(jiebaR)
## Loading required package: jiebaRD
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(parallel)
library(tm)
## Loading required package: NLP
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(wordcloud2)
## 加载数据
load("训练集分词结果.RData")
load("训练集分词标记.RData")
load("预测集分词标记.RData")
load("预测集分词结果.RData")
## 将分词后的结果转化为文档词条矩阵 #####
## 针对训练集
corpus = Corpus(VectorSource(results))
train_dtm = DocumentTermMatrix(corpus,control = list(wordLengths = c(1, Inf)))# 文档-词矩阵
## 查看词项数目
train_dtm
## <<DocumentTermMatrix (documents: 17663, terms: 350634)>>
## Non-/sparse entries: 3549676/6189698666
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
## 该步骤维简化稀疏矩阵#####
# Start by removing sparse terms:
# This makes a matrix that is 10% empty space, maximum.
train_dtmr <- removeSparseTerms(train_dtm, 0.998)
## Warning in nr * nc: 整数上溢产生了NA
train_dtmr
## <<DocumentTermMatrix (documents: 17663, terms: 14564)>>
## Non-/sparse entries: 2477031/254766901
## Sparsity : 99%
## Maximal term length: 12
## Weighting : term frequency (tf)
# train_dtmr$dimnames$Terms
#查看高频词汇
freq.terms <- sort(colSums(as.matrix(train_dtmr)),decreasing = TRUE)
freq.terms <- tbl_df(data.frame(name = names(freq.terms),fre = freq.terms))
names(freq.terms) <- c("word","freq")
freq.terms$word <- as.factor(freq.terms$word)
summary(freq.terms)
## word freq
## 0.1 : 1 Min. : 36.0
## 0.5 : 1 1st Qu.: 79.0
## 007 : 1 Median : 129.0
## 1.0 : 1 Mean : 330.2
## 1.1 : 1 3rd Qu.: 265.0
## 1.2 : 1 Max. :48922.0
## (Other):14558
## 绘制词频图
nn <- 6000
freq.terms[freq.terms$freq>nn,] %>%
ggplot(aes(x = word, y = freq)) +
theme_bw(base_family = "STKaiti") +
geom_bar(stat="identity",fill = "lightblue") +
scale_x_discrete() +
theme(axis.text.x=element_text(angle=75, hjust=1,size = 9))+
labs(x = "词项",y = "频数",title = "词频图")

## 词云######
sum(freq.terms$freq > 2000)
## [1] 311
set.seed(375) # to make it reproducible
grayLevels <- gray( (freq.terms$freq+10) / (max(freq.terms$freq)+10) )
par(family = "STKaiti",mfrow = c(1,1))
wordcloud(words=freq.terms$word, freq=freq.terms$freq,
min.freq=2000, scale=c(4,0.5),random.color=TRUE,
random.order=FALSE,colors=grayLevels)

wordcloud(words=freq.terms$word, freq=freq.terms$freq,random.order = FALSE,
random.color = FALSE,min.freq=2000,
scale=c(3,0.65), colors=brewer.pal(6,"Dark2"),family = "STKaiti")

data.frame(freq.terms[freq.terms$freq>2000,]) %>%
wordcloud2::wordcloud2(fontWeight = 'normal',
color = 'random-dark',backgroundColor = "gray")
## 自定义图片词云
figPath = system.file("examples/t.png",package = "wordcloud2")
data.frame(freq.terms[freq.terms$freq>2000,]) %>%
wordcloud2::wordcloud2(figPath = figPath,
size = 1,color = "skyblue")
data.frame(freq.terms[freq.terms$freq>2000,]) %>%
letterCloud("RSG",wordSize = 8.5)
## 针对预测集#####------------------------------------
corpus = Corpus(VectorSource(resultstest))
test_dtm = DocumentTermMatrix(corpus,control = list(wordLengths = c(1, Inf))) # 文档-词矩阵
## 查看词项数目
test_dtm
## <<DocumentTermMatrix (documents: 20000, terms: 374521)>>
## Non-/sparse entries: 3993022/7486426978
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
## 该步骤维简化稀疏矩阵#####
# Start by removing sparse terms:
# This makes a matrix that is 10% empty space, maximum.
test_dtmr <- removeSparseTerms(test_dtm, 0.998)
## Warning in nr * nc: 整数上溢产生了NA
test_dtmr
## <<DocumentTermMatrix (documents: 20000, terms: 14395)>>
## Non-/sparse entries: 2776507/285123493
## Sparsity : 99%
## Maximal term length: 12
## Weighting : term frequency (tf)
# test_dtmr$dimnames$Terms
#查看高频词汇
freq.terms <- sort(colSums(as.matrix(test_dtmr)),decreasing = TRUE)
freq.terms <- tbl_df(data.frame(name = names(freq.terms),fre = freq.terms))
names(freq.terms) <- c("word","freq")
freq.terms$word <- as.factor(freq.terms$word)
summary(freq.terms)
## word freq
## 0.5 : 1 Min. : 41
## 007 : 1 1st Qu.: 90
## 1.0 : 1 Median : 147
## 1.1 : 1 Mean : 375
## 1.2 : 1 3rd Qu.: 300
## 1.3 : 1 Max. :56649
## (Other):14389
## 绘制词频图
nn <- 6000
freq.terms[freq.terms$freq>nn,] %>%
ggplot(aes(x = word, y = freq)) +
theme_bw(base_family = "STKaiti") +
geom_bar(stat="identity",fill = "lightblue") +
scale_x_discrete() +
theme(axis.text.x=element_text(angle=75, hjust=1,size = 9))+
labs(x = "词项",y = "频数",title = "词频图")

## 词云######
sum(freq.terms$freq > 2000)
## [1] 364
set.seed(375) # to make it reproducible
grayLevels <- gray( (freq.terms$freq+10) / (max(freq.terms$freq)+10) )
par(family = "STKaiti",mfrow = c(1,1))
wordcloud(words=freq.terms$word, freq=freq.terms$freq,
min.freq=2000, scale=c(4,0.5),random.color=TRUE,
random.order=FALSE,colors=grayLevels)

wordcloud(words=freq.terms$word, freq=freq.terms$freq,random.order = FALSE,
random.color = FALSE,min.freq=2000,
scale=c(3,0.65), colors=brewer.pal(6,"Dark2"),family = "STKaiti")

data.frame(freq.terms[freq.terms$freq>2000,]) %>%
wordcloud2::wordcloud2(fontWeight = 'normal',
color = 'random-dark',backgroundColor = "gray")
## 自定义图片词云
figPath = system.file("examples/t.png",package = "wordcloud2")
data.frame(freq.terms[freq.terms$freq>2000,]) %>%
wordcloud2::wordcloud2(figPath = figPath,
size = 1,color = "skyblue")
data.frame(freq.terms[freq.terms$freq>2000,]) %>%
letterCloud("RSG",wordSize = 8.5)