# Text-mining
library(XML)
## Warning: package 'XML' was built under R version 3.5.2
library(xml2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(jiebaR)
## Loading required package: jiebaRD
library(ggplot2) # for plotting word frequencies
library(wordcloud) # wordclouds!
## Loading required package: RColorBrewer
library(Rwordseg)
## Loading required package: rJava
## # Version: 0.2-1
library(subprocess)
library(knitr)
library("tm")
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library("tmcn")
## # tmcn Version: 0.2-12
##
## Attaching package: 'tmcn'
## The following objects are masked from 'package:Rwordseg':
##
## insertWords, segmentCN
library("rJava")
library(tidytext)
library(topicmodels)
library(ggplot2)
library(dplyr)
library(stringr)
# Load the saved data set and create the jiebaR segmentation worker.
# NOTE(review): setwd() to an absolute, user-specific path and rm(list = ls())
# are kept so the script's side effects stay unchanged, but both make the
# script non-portable -- consider a project-relative path instead.
setwd("/Users/huangzongxian/Desktop/R/101/11 Text-Mining")
dir()
## [1] "Crawl & Text-Mining.R" "new.data.Rdata" "Text.html"
## [4] "Text.Rmd"
rm(list = ls())
# load() returns the names of the objects it restored. Check that `data` is
# among them: otherwise `data$content` below would resolve to the
# utils::data() function and fail with a cryptic "closure" error.
loaded_objects <- load('new.data.Rdata')
if (!"data" %in% loaded_objects) {
  stop("'new.data.Rdata' does not contain an object named 'data'",
       call. = FALSE)
}
docs <- as.character(data$content)
# TRUE rather than T: T is an ordinary variable that the load() above could
# have overwritten.
# Worker types: c("mix", "query", "hmm", "mp", "tag", "full")
cutter <- worker('full', bylines = TRUE)
#' Keep only the noun tokens of a tagged segmentation result.
#'
#' @param x A vector of tokens whose names are the tags assigned to them
#'   (here: the output of the tagging worker for one document).
#' @param tags Character vector of tag names to keep. Defaults to the noun
#'   tags this script has always used, so existing one-argument calls behave
#'   exactly as before.
#' @return The elements of `x` whose name is in `tags`, names preserved.
#'   An unnamed or empty `x` yields an empty vector.
get_noun <- function(x,
                     tags = c("n", "nr", "nr1", "nr2", "nrj", "nrf",
                              "ns", "nsf", "nt", "nz", "nl", "ng")) {
  # %in% never returns NA, so tokens with a missing name are simply dropped.
  x[names(x) %in% tags]
}
# Register names and terms in the worker's user dictionary.
new_user_word(cutter, c('陳雄文', '郭台銘', '蘇貞昌', '徐國勇', '蔡英文', '陳水扁','替代役','賀陳旦', '社會役'))
## [1] TRUE
# Switch the worker's default mode to tagging so tokens carry tag names.
# Worker types: c("mix", "query", "hmm", "mp", "tag", "full")
cutter$default <- "tag"
# lapply()/vapply() instead of sapply(): sapply() simplifies its result to a
# matrix whenever every document yields the same number of nouns, and the
# collapse step would then run per cell instead of per document.
text_wb <- lapply(cutter[docs], get_noun)
# Collapse each document's nouns into one space-separated string; vapply()
# guarantees exactly one string per document (an empty noun set gives "").
text_wb <- vapply(text_wb, paste, character(1), collapse = " ")
# Pair each document's noun text with its label from data$president.
# tibble() replaces the deprecated data_frame().
text_df <- tibble(president = data$president, text = text_wb)
# Count every word per president and attach tf, idf and tf-idf, treating each
# president as one "document" for the idf computation.
# NOTE(review): unnest_tokens() applies its own default word tokenizer to the
# space-joined text, which may split tokens that jiebaR kept whole -- confirm
# whether a whitespace-only tokenizer is wanted here.
president_words <- text_df %>%
  unnest_tokens(word, text) %>%
  count(president, word, sort = TRUE) %>%
  ungroup() %>%
  bind_tf_idf(word, president, n)
# Total number of counted words per president.
total_words <- president_words %>%
  group_by(president) %>%
  summarize(total = sum(n))
# Name the join key explicitly instead of relying on the common-column guess.
president_words <- left_join(president_words, total_words, by = "president")
president_words # tf-idf with different presidents
## # A tibble: 2,667 x 7
## president word n tf idf tf_idf total
## <chr> <chr> <int> <dbl> <dbl> <dbl> <int>
## 1 蔡英文 蔡 110 0.0428 0 0 2568
## 2 韓國瑜 蔡 102 0.0410 0 0 2486
## 3 宋楚瑜 總統 101 0.0379 0 0 2667
## 4 蔡英文 英文 83 0.0323 0 0 2568
## 5 韓國瑜 英文 79 0.0318 0 0 2486
## 6 蔡英文 總統 76 0.0296 0 0 2568
## 7 韓國瑜 總統 76 0.0306 0 0 2486
## 8 蔡英文 民進黨 69 0.0269 0 0 2568
## 9 韓國瑜 韓國 69 0.0278 0 0 2486
## 10 宋楚瑜 蔡 66 0.0247 0 0 2667
## # … with 2,657 more rows
# Show every (president, word) row ordered from the highest tf-idf down,
# without the per-president total column.
arrange(select(president_words, -total), desc(tf_idf))
## # A tibble: 2,667 x 6
## president word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 宋楚瑜 蔣經國 36 0.0135 1.10 0.0148
## 2 宋楚瑜 宋楚瑜 47 0.0176 0.405 0.00715
## 3 宋楚瑜 親民黨 38 0.0142 0.405 0.00578
## 4 宋楚瑜 客廳 12 0.00450 1.10 0.00494
## 5 宋楚瑜 郭 31 0.0116 0.405 0.00471
## 6 蔡英文 通車 25 0.00974 0.405 0.00395
## 7 韓國瑜 通車 24 0.00965 0.405 0.00391
## 8 宋楚瑜 宋 9 0.00337 1.10 0.00371
## 9 宋楚瑜 同 9 0.00337 1.10 0.00371
## 10 宋楚瑜 銘 24 0.00900 0.405 0.00365
## # … with 2,657 more rows
# tf-idf plot: the highest tf-idf words of each president, one facet each.
president_words %>%
  select(-total) %>%
  arrange(desc(tf_idf)) %>%
  # Freeze the level order (reverse of the tf-idf ranking) so the bars are
  # drawn largest-on-top after coord_flip().
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(president) %>%
  # Rank explicitly by tf_idf instead of letting top_n() fall back to the
  # last column. top_n() keeps ties, so a facet can show more than ten words.
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = president)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~president, ncol = 2, scales = "free") +
  coord_flip() +
  theme(text = element_text(family = 'STKaiti'))

# word.cloud: draw every word that was counted more than once.
library(wordcloud2)
d <- data.frame(president = president_words$president,
                word = president_words$word,
                freq = president_words$n)
# Build the (word, frequency) table directly. The previous cbind() pushed the
# counts through a character matrix (and factors on R < 4.0) only to convert
# them back to numbers. Column names V1/V2 are kept as before.
# NOTE(review): a word used for several presidents appears once per
# president here -- confirm whether the counts should be summed per word.
txt_freq <- data.frame(V1 = as.character(d$word),
                       V2 = as.numeric(d$freq),
                       stringsAsFactors = FALSE)
wordcloud2(filter(txt_freq, V2 > 1),
           minSize = 2, fontFamily = "Microsoft YaHei", size = 1)
# Histogram of relative word frequency (n / total), one panel per president.
# The x axis is capped at 0.0009; rows beyond that limit are dropped by
# ggplot2 with a warning.
ggplot(data = president_words,
       mapping = aes(x = n / total, fill = president)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(facets = ~president, ncol = 2, scales = "free_y") +
  theme(axis.ticks = element_blank(),
        text = element_text(size = 13, family = "黑體-繁 中黑"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 609 rows containing non-finite values (stat_bin).
## Warning: Removed 3 rows containing missing values (geom_bar).
