Social Media Analysis 4

# Packages this script loads further down; any that are not installed yet are
# installed before loading.
# topicmodels and LDAvis are included because they are loaded with require(),
# which only warns (instead of stopping) when a package is absent.
packages <- c(
  "dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2",
  "ggplot2", "tidyr", "scales", "topicmodels", "LDAvis"
)
# Row names of installed.packages() are the installed package names.
existing <- rownames(installed.packages())
missing_pkgs <- setdiff(packages, existing)
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

require(dplyr)

## Loading required package: dplyr

## Warning: package 'dplyr' was built under R version 3.5.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

require(tidytext)

## Loading required package: tidytext

## Warning: package 'tidytext' was built under R version 3.5.3

require(jiebaR)

## Loading required package: jiebaR

## Warning: package 'jiebaR' was built under R version 3.5.3

## Loading required package: jiebaRD

require(gutenbergr)

## Loading required package: gutenbergr

library(stringr)

## Warning: package 'stringr' was built under R version 3.5.3

library(wordcloud2)
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.5.3

library(tidyr)

## Warning: package 'tidyr' was built under R version 3.5.3

library(scales)
require(topicmodels)

## Loading required package: topicmodels

## Warning: package 'topicmodels' was built under R version 3.5.3

require(LDAvis)

## Loading required package: LDAvis

## Warning: package 'LDAvis' was built under R version 3.5.3

# Download Project Gutenberg book 74, then drop empty lines and keep one row
# per distinct (gutenberg_id, text) pair.
tom <- gutenberg_download(74)
tom <- tom %>%
  filter(text != "") %>%
  distinct(gutenberg_id, text)

## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest

## Using mirror http://aleph.gutenberg.org

# Glue the book's lines into one string and re-split it into sentence-like
# pieces, one row per piece.
# Lines are joined with a space so the last word of one line is not fused with
# the first word of the next.
doc <- paste0(tom$text, collapse = " ")
# Split on a full-width or half-width period (and full-width ? / !), then strip
# the whitespace left at the edges of each piece so a piece that opens with a
# heading still begins at its first character.
docVector <- trimws(unlist(strsplit(doc, "[。.？！]"), use.names = FALSE))
# Replace 74L with your own book id. The id is given as an integer and
# stringsAsFactors is off so neither column passes through a factor
# (as.integer() on a factor returns level codes, not the original value).
tom1 <- data.frame(gutenberg_id = 74L, text = docVector, stringsAsFactors = FALSE)
tom1$text <- as.character(tom1$text)
tom1$gutenberg_id <- as.integer(tom1$gutenberg_id)
# The data viewer is only opened in an interactive session.
if (interactive()) {
  View(tom1)
}

# Number the pieces by chapter: the running count increases by one at every
# piece whose text starts with "CHAPTER " (matched together with any Roman
# numeral that follows). Pieces before the first match get chapter 0.
tom1$chapter <- cumsum(
  str_detect(tom1$text, regex("^CHAPTER (XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"))
)

View(tom1)

斷詞

# Tokenize: split the text column into one row per token, stored in `word`.
tokens <- unnest_tokens(tom1, output = word, input = text)

使用BING分析

# Net sentiment per chapter with the Bing lexicon: keep only tokens found in
# the lexicon, count positive and negative hits per chapter (exposed as
# `index`), spread the two counts into columns (0 where a chapter has none),
# and subtract negative from positive.
tom1_bing <- inner_join(tokens, get_sentiments("bing")) %>%
  count(index = chapter, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining, by = "word"

library(ggplot2)
# Bar chart of the net Bing sentiment score (positive minus negative) for each
# chapter index.
tom1_bing %>%
  ggplot(aes(index, sentiment)) +
  # A single layer is enough; a second geom_col() would only overdraw the
  # same bars.
  geom_col(fill = "#5599FF", show.legend = FALSE) +
  labs(x = "CHAPTER", y = "SENTIMENT SCORE")

TF-IDF

# Occurrences of each word within each chapter, most frequent first.
tom_words <- count(tokens, chapter, word, sort = TRUE)

# Add tf, idf and tf_idf columns, treating each chapter as one document, and
# order the rows from highest to lowest tf_idf.
tom_tf_idf <- bind_tf_idf(tom_words, word, chapter, n)
tom_tf_idf <- arrange(tom_tf_idf, desc(tf_idf))

# Print the ranked table.
arrange(tom_tf_idf, desc(tf_idf))

## # A tibble: 25,443 x 6
##    chapter word            n      tf   idf tf_idf
##      <int> <chr>       <int>   <dbl> <dbl>  <dbl>
##  1       2 ling           14 0.00763  3.33 0.0254
##  2       0 <U+3E30>              8 0.00913  2.23 0.0204
##  3       0 <U+613C>              8 0.00913  2.23 0.0204
##  4       5 beetle          9 0.00493  3.33 0.0164
##  5      26 jones           5 0.00601  2.23 0.0134
##  6       2 chow            7 0.00382  3.33 0.0127
##  7       2 ting            7 0.00382  3.33 0.0127
##  8       2 whitewash       7 0.00382  3.33 0.0127
##  9      26 everybody's     3 0.00361  3.33 0.0120
## 10       8 marble          6 0.00359  3.33 0.0120
## # ... with 25,433 more rows

# Drop the two garbled single-character tokens (U+3E30 and U+613C) that show
# up near the top of the ranking. They are removed by value rather than by
# row position, so the right rows go regardless of sort order or row count,
# and every chapter's copy is removed, not only the two top-ranked rows.
tom_tf_idf <- tom_tf_idf %>%
  filter(!word %in% c("\u3e30", "\u613c"))

# Horizontal bar chart of the rows with the highest tf-idf scores.
tf_idf <- tom_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  # Fix the factor levels in reverse rank order so that, after coord_flip(),
  # the highest-ranked word is drawn at the top.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  # Name the ranking column explicitly instead of letting top_n() fall back to
  # whichever column happens to be last.
  top_n(15, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(word, tf_idf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  coord_flip()

tf_idf

# Horizontal bar chart of the rows with the highest term frequency (tf).
tf <- tom_tf_idf %>%
  arrange(desc(tf)) %>%
  # Fix the factor levels in reverse rank order so that, after coord_flip(),
  # the highest-ranked word is drawn at the top.
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  # Rank by tf explicitly: without a weight, top_n() uses the last column
  # (tf_idf), which would select the wrong rows for this chart.
  top_n(15, tf) %>%
  ungroup() %>%
  ggplot(aes(word, tf)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf") +
  coord_flip()

tf

LDA模型

# Document-term matrix with one document per chapter, built from the
# per-chapter word counts.
tom_dtm <- cast_dtm(tom_words, chapter, word, n)
# Two-topic LDA model; the seed is fixed so repeated runs give the same fit.
tom_lda <- LDA(tom_dtm, k = 2, control = list(seed = 1234))
# Tidy table of per-topic, per-term probabilities (beta).
tom_topics <- tidy(tom_lda, matrix = "beta")
# Faceted horizontal bar chart of the ten highest-beta terms in each topic.
tom_top_terms <- tom_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta)) %>%
  # Order the term factor by beta so the bars are sorted within the plot.
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

# Render the chart.
tom_top_terms

Social Media Analysis 4_27 midterm

陳柏愷

2019年4月27日

斷詞

使用BING分析

TF-IDF

LDA模型