## sougou用户画像数据分析与数据挖掘####
## 对分词后的结果进行处理,数据探索性分析



##设置工作文件夹
setwd("/Users/Daitu/数据分析/CCF/Sogou画像")
getwd()
## [1] "/Users/daitu/数据分析/CCF/Sogou画像"
## 加载所需要的包
library(jiebaR)
## Loading required package: jiebaRD
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(parallel)
library(tm)
## Loading required package: NLP
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(wordcloud2)

## 加载数据
load("训练集分词结果.RData")
load("训练集分词标记.RData")
load("预测集分词标记.RData")
load("预测集分词结果.RData")

## 将分词后的结果转化为文档词条矩阵 #####
## 针对训练集
corpus = Corpus(VectorSource(results))
train_dtm = DocumentTermMatrix(corpus,control = list(wordLengths = c(1, Inf)))# 文档-词矩阵
## 查看词项数目
train_dtm
## <<DocumentTermMatrix (documents: 17663, terms: 350634)>>
## Non-/sparse entries: 3549676/6189698666
## Sparsity           : 100%
## Maximal term length: 40
## Weighting          : term frequency (tf)
## 该步骤维简化稀疏矩阵#####
#  Start by removing sparse terms:
# This makes a matrix that is 10% empty space, maximum.
train_dtmr <- removeSparseTerms(train_dtm, 0.998) 
## Warning in nr * nc: 整数上溢产生了NA
train_dtmr
## <<DocumentTermMatrix (documents: 17663, terms: 14564)>>
## Non-/sparse entries: 2477031/254766901
## Sparsity           : 99%
## Maximal term length: 12
## Weighting          : term frequency (tf)
# train_dtmr$dimnames$Terms

#查看高频词汇
freq.terms <- sort(colSums(as.matrix(train_dtmr)),decreasing = TRUE)
freq.terms <- tbl_df(data.frame(name = names(freq.terms),fre = freq.terms))
names(freq.terms) <- c("word","freq")
freq.terms$word <- as.factor(freq.terms$word)
summary(freq.terms)
##       word            freq        
##  0.1    :    1   Min.   :   36.0  
##  0.5    :    1   1st Qu.:   79.0  
##  007    :    1   Median :  129.0  
##  1.0    :    1   Mean   :  330.2  
##  1.1    :    1   3rd Qu.:  265.0  
##  1.2    :    1   Max.   :48922.0  
##  (Other):14558
## 绘制词频图
nn <- 6000
freq.terms[freq.terms$freq>nn,] %>%
  ggplot(aes(x = word, y = freq)) +
  theme_bw(base_family = "STKaiti") + 
  geom_bar(stat="identity",fill = "lightblue") + 
  scale_x_discrete() +
  theme(axis.text.x=element_text(angle=75, hjust=1,size = 9))+
  labs(x = "词项",y = "频数",title = "词频图")

## 词云######

sum(freq.terms$freq > 2000)
## [1] 311
set.seed(375) # to make it reproducible
grayLevels <- gray( (freq.terms$freq+10) / (max(freq.terms$freq)+10) )
par(family = "STKaiti",mfrow = c(1,1))
wordcloud(words=freq.terms$word, freq=freq.terms$freq, 
          min.freq=2000, scale=c(4,0.5),random.color=TRUE,
          random.order=FALSE,colors=grayLevels)

wordcloud(words=freq.terms$word, freq=freq.terms$freq,random.order = FALSE, 
          random.color = FALSE,min.freq=2000,
          scale=c(3,0.65), colors=brewer.pal(6,"Dark2"),family = "STKaiti")

data.frame(freq.terms[freq.terms$freq>2000,]) %>%
  wordcloud2::wordcloud2(fontWeight = 'normal',
                       color = 'random-dark',backgroundColor = "gray")
## 自定义图片词云
figPath = system.file("examples/t.png",package = "wordcloud2")
data.frame(freq.terms[freq.terms$freq>2000,]) %>%
wordcloud2::wordcloud2(figPath = figPath,
                       size = 1,color = "skyblue")
data.frame(freq.terms[freq.terms$freq>2000,]) %>%
  letterCloud("RSG",wordSize = 8.5)
## 针对预测集#####------------------------------------
corpus = Corpus(VectorSource(resultstest))
test_dtm = DocumentTermMatrix(corpus,control = list(wordLengths = c(1, Inf))) # 文档-词矩阵
## 查看词项数目
test_dtm
## <<DocumentTermMatrix (documents: 20000, terms: 374521)>>
## Non-/sparse entries: 3993022/7486426978
## Sparsity           : 100%
## Maximal term length: 40
## Weighting          : term frequency (tf)
## 该步骤维简化稀疏矩阵#####
#  Start by removing sparse terms:
# This makes a matrix that is 10% empty space, maximum.
test_dtmr <- removeSparseTerms(test_dtm, 0.998) 
## Warning in nr * nc: 整数上溢产生了NA
test_dtmr
## <<DocumentTermMatrix (documents: 20000, terms: 14395)>>
## Non-/sparse entries: 2776507/285123493
## Sparsity           : 99%
## Maximal term length: 12
## Weighting          : term frequency (tf)
# test_dtmr$dimnames$Terms

#查看高频词汇
freq.terms <- sort(colSums(as.matrix(test_dtmr)),decreasing = TRUE)
freq.terms <- tbl_df(data.frame(name = names(freq.terms),fre = freq.terms))
names(freq.terms) <- c("word","freq")
freq.terms$word <- as.factor(freq.terms$word)
summary(freq.terms)
##       word            freq      
##  0.5    :    1   Min.   :   41  
##  007    :    1   1st Qu.:   90  
##  1.0    :    1   Median :  147  
##  1.1    :    1   Mean   :  375  
##  1.2    :    1   3rd Qu.:  300  
##  1.3    :    1   Max.   :56649  
##  (Other):14389
## 绘制词频图
nn <- 6000
freq.terms[freq.terms$freq>nn,] %>%
  ggplot(aes(x = word, y = freq)) +
  theme_bw(base_family = "STKaiti") + 
  geom_bar(stat="identity",fill = "lightblue") + 
  scale_x_discrete() +
  theme(axis.text.x=element_text(angle=75, hjust=1,size = 9))+
  labs(x = "词项",y = "频数",title = "词频图")

## 词云######

sum(freq.terms$freq > 2000)
## [1] 364
set.seed(375) # to make it reproducible
grayLevels <- gray( (freq.terms$freq+10) / (max(freq.terms$freq)+10) )
par(family = "STKaiti",mfrow = c(1,1))
wordcloud(words=freq.terms$word, freq=freq.terms$freq, 
          min.freq=2000, scale=c(4,0.5),random.color=TRUE,
          random.order=FALSE,colors=grayLevels)

wordcloud(words=freq.terms$word, freq=freq.terms$freq,random.order = FALSE, 
          random.color = FALSE,min.freq=2000,
          scale=c(3,0.65), colors=brewer.pal(6,"Dark2"),family = "STKaiti")

data.frame(freq.terms[freq.terms$freq>2000,]) %>%
  wordcloud2::wordcloud2(fontWeight = 'normal',
                         color = 'random-dark',backgroundColor = "gray")
## 自定义图片词云
figPath = system.file("examples/t.png",package = "wordcloud2")
data.frame(freq.terms[freq.terms$freq>2000,]) %>%
  wordcloud2::wordcloud2(figPath = figPath,
                         size = 1,color = "skyblue")
data.frame(freq.terms[freq.terms$freq>2000,]) %>%
  letterCloud("RSG",wordSize = 8.5)