Text-Mining and Crawl

# Text-mining
library(XML)

## Warning: package 'XML' was built under R version 3.5.2

library(xml2)
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(jiebaR)

## Loading required package: jiebaRD

library(ggplot2) # for plotting word frequencies
library(wordcloud) # wordclouds!

## Loading required package: RColorBrewer

library(Rwordseg)

## Loading required package: rJava

## # Version: 0.2-1

library(subprocess)
library(knitr)
library("tm")

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library("tmcn")

## # tmcn Version: 0.2-12

## 
## Attaching package: 'tmcn'

## The following objects are masked from 'package:Rwordseg':
## 
##     insertWords, segmentCN

library("rJava")
library(tidytext)
library(topicmodels)
library(ggplot2)
library(dplyr)
library(stringr)
# Move into the project folder and list its files to confirm that the saved
# data file is present.
# NOTE(review): the absolute path is machine-specific; the script will not run
# elsewhere without editing it.
setwd("/Users/huangzongxian/Desktop/R/101/11 Text-Mining")
list.files()

## [1] "Crawl & Text-Mining.R" "new.data.Rdata"        "Text.html"            
## [4] "Text.Rmd"

# Clear global variables, then restore the objects saved in the .Rdata file.
# NOTE(review): rm(list = ls()) does not detach packages or reset the working
# directory, so it is not a full session reset.
rm(list = ls())
load("new.data.Rdata")

# `data` is expected to be supplied by the loaded file -- TODO confirm.
# Its `content` column is coerced to a plain character vector.
docs <- as.character(data$content)

# Segmentation worker. TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned.
# Available worker types: c("mix", "query", "hmm", "mp", "tag", "full")
cutter <- worker("full", bylines = TRUE)
# Keep only the elements of `x` whose name is one of the noun tags below
# (presumably jiebaR part-of-speech tags -- confirm against the tagger docs).
#
# x: a named vector; element names are matched against the tag list.
# Returns the matching elements of `x`, in their original order. An input
# without matching names gives a zero-length result.
get_noun <- function(x) {
  noun_tags <- c(
    "n", "nr", "nr1", "nr2", "nrj", "nrf",
    "ns", "nsf", "nt", "nz", "nl", "ng"
  )
  x[is.element(names(x), noun_tags)]
}
# Register names and terms that the worker should keep as single tokens.
new_user_word(
  cutter,
  c(
    "陳雄文", "郭台銘", "蘇貞昌", "徐國勇", "蔡英文",
    "陳水扁", "替代役", "賀陳旦", "社會役"
  )
)

## [1] TRUE

# Switch the worker's default mode to "tag" so that tokens carry tag names
# that get_noun() can filter on.
# Available modes: c("mix", "query", "hmm", "mp", "tag", "full")
cutter$default <- "tag"

# lapply() always returns one list entry per segmented document. sapply()
# would simplify to a matrix whenever every document yields the same number
# of nouns, and the next step would then paste single cells instead of
# whole documents.
text_wb <- lapply(cutter[docs], get_noun)

# Collapse each document's nouns into one space-separated string; vapply()
# guarantees a character vector with exactly one string per document.
text_wb <- vapply(text_wb, paste, character(1), collapse = " ")



# Build the president/text table: the `president` column from `data` next to
# the space-separated nouns extracted for each document.
# tibble() replaces the deprecated data_frame() constructor.
text_df <- tibble(president = data$president, text = text_wb)

## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.

# Split each text into one word per row, count words per president, and add
# the tf, idf and tf_idf columns (each president is treated as a document).
president_words <- text_df %>%
  unnest_tokens(word, text) %>%
  count(president, word, sort = TRUE) %>%
  ungroup() %>%
  bind_tf_idf(word, president, n)

# Total of the word counts for each president.
total_words <- president_words %>%
  group_by(president) %>%
  summarize(total = sum(n))

# Attach the totals. The key is named explicitly so the join cannot silently
# pick up extra shared columns if either table changes later.
president_words <- left_join(president_words, total_words, by = "president")

## Joining, by = "president"

# Display the per-president word table: count (n), tf, idf, tf_idf and the
# per-president total.
president_words # tf-idf of each word for the different presidents

## # A tibble: 2,667 x 7
##    president word       n     tf   idf tf_idf total
##    <chr>     <chr>  <int>  <dbl> <dbl>  <dbl> <int>
##  1 蔡英文    蔡       110 0.0428     0      0  2568
##  2 韓國瑜    蔡       102 0.0410     0      0  2486
##  3 宋楚瑜    總統     101 0.0379     0      0  2667
##  4 蔡英文    英文      83 0.0323     0      0  2568
##  5 韓國瑜    英文      79 0.0318     0      0  2486
##  6 蔡英文    總統      76 0.0296     0      0  2568
##  7 韓國瑜    總統      76 0.0306     0      0  2486
##  8 蔡英文    民進黨    69 0.0269     0      0  2568
##  9 韓國瑜    韓國      69 0.0278     0      0  2486
## 10 宋楚瑜    蔡        66 0.0247     0      0  2667
## # … with 2,657 more rows

# Display the same table without the `total` column, highest tf-idf first.
arrange(select(president_words, -total), desc(tf_idf))

## # A tibble: 2,667 x 6
##    president word       n      tf   idf  tf_idf
##    <chr>     <chr>  <int>   <dbl> <dbl>   <dbl>
##  1 宋楚瑜    蔣經國    36 0.0135  1.10  0.0148 
##  2 宋楚瑜    宋楚瑜    47 0.0176  0.405 0.00715
##  3 宋楚瑜    親民黨    38 0.0142  0.405 0.00578
##  4 宋楚瑜    客廳      12 0.00450 1.10  0.00494
##  5 宋楚瑜    郭        31 0.0116  0.405 0.00471
##  6 蔡英文    通車      25 0.00974 0.405 0.00395
##  7 韓國瑜    通車      24 0.00965 0.405 0.00391
##  8 宋楚瑜    宋         9 0.00337 1.10  0.00371
##  9 宋楚瑜    同         9 0.00337 1.10  0.00371
## 10 宋楚瑜    銘        24 0.00900 0.405 0.00365
## # … with 2,657 more rows

# tf-idf plot: the ten highest tf-idf words for each president, drawn as
# horizontal bars with one facet per president.
president_words %>%
  select(-total) %>%
  arrange(desc(tf_idf)) %>%
  # Freeze the factor levels in ranked order so the bars follow the tf-idf
  # ordering instead of being sorted alphabetically.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  group_by(president) %>%
  # Rank on tf_idf explicitly; without `wt`, top_n() uses whichever column
  # happens to be last.
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = president)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~president, ncol = 2, scales = "free") +
  coord_flip() +
  # Font family chosen so the Chinese labels render.
  theme(text = element_text(family = "STKaiti"))

## Selecting by tf_idf

# word.cloud
library(wordcloud2)
# President / word / count table used as the source for the word cloud.
d <- data.frame(
  president = president_words$president,
  word = president_words$word,
  freq = president_words$n
)

# Two-column word/frequency input for wordcloud2(). The columns are built
# directly rather than via cbind(), which would turn the counts into strings
# (and into factors on older R) before converting them back. The V1/V2
# column names are kept for any later code that refers to them.
# NOTE(review): a word used for several presidents appears once per
# president here; confirm whether the counts should be summed per word.
txt_freq <- data.frame(
  V1 = as.character(d$word),
  V2 = as.numeric(d$freq),
  stringsAsFactors = FALSE
)

# Draw the cloud using only words that occur more than once.
wordcloud2(
  dplyr::filter(txt_freq, V2 > 1),
  minSize = 2, fontFamily = "Microsoft YaHei", size = 1
)

# Histogram of term frequency (n / total), one facet per president.
# NOTE(review): the upper x limit of 0.0009 drops many rows (see the
# "Removed ... rows" warnings); confirm it suits the size of this corpus.
ggplot(president_words, aes(n / total, fill = president)) +
  # bins is stated explicitly; 30 is the value stat_bin() would pick by
  # default, so the plot is unchanged and the message goes away.
  geom_histogram(bins = 30, show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~president, ncol = 2, scales = "free_y") +
  theme(
    text = element_text(family = "黑體-繁 中黑", size = 13),
    axis.ticks = element_blank()
  )

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 609 rows containing non-finite values (stat_bin).

## Warning: Removed 3 rows containing missing values (geom_bar).

Text-Mining and Crawl

Zong-Xian Huang

2019/12/27