Ch0 About data
** 資料是從文字分析平台下載而來,使用2/1~2/3華航罷工之PTT文章,關鍵字為:華航、罷工、機師,共有2200多篇文章**
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
OS reports request to set locale to "zh_TW.UTF-8" cannot be honored
[1] ""
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
require(data.table)
require(readr)
require(dplyr)
require(stringr)
require(jsonlite)
require(tidyr)
require(tidytext)
require(wordcloud)
require(reshape2)
require(ggraph) #extends ggplot2 to construct network graph.
require(widyr) #calculate pairwise correlation.
require(igraph)
require(topicmodels)
2.1 ~ 2.3 直接透過平台的產出(文章+情緒)
csv <- fread("../data/social_media_artSen.csv",encoding = "UTF-8")
data <- csv %>%
mutate(sentiment = positive_emotion_grade - negative_emotion_grade) %>%
mutate(doc_id = row_number()) %>%
select(doc_id, sentiment)
data[1:150,] %>%
ggplot(aes(doc_id, sentiment)) +
geom_col(show.legend = FALSE)

2.4 最常出現的正面與負面情緒
csv <- fread("../data/social_media_artWordFreq.csv", encoding = "UTF-8")
中文情緒辭典
positive_file <- "../data/positive.txt"
positive_line <- read_lines(positive_file, n_max = 200000, progress = FALSE)
positive <- str_split(positive_line, ",")
negative_file <- "../data/negative.txt"
negative_line <- read_lines(negative_file, n_max = 200000, progress = FALSE)
negative <- str_split(negative_line, ",")
positive <- data.frame(word = positive, sentiments = "positive")
colnames(positive) = c("word","sentiment")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
對應中文情緒辭典
data <- csv
word_counts <- data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
inner_join(LIWC_ch)
Joining, by = "word"
Column `word` joining character vector and factor, coercing into character vector
word_counts
word_counts %>%
group_by(sentiment) %>%
top_n(10,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(family="黑體-繁 中黑", size=14))+
coord_flip()
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale

par(family=("Heiti TC Light")) #避免字型發生錯誤
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
word_counts %>%
with(wordcloud(word, count, max.words = 100))

par(family=("Heiti TC Light")) #避免字型發生錯誤
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale
word_counts %>%
acast(word ~ sentiment, value.var = "count", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)

2.7 計算負面情緒佔該篇評論的比率 (原本是佔該章節)
all_word <- data %>%
mutate(id = group_indices(data,artUrl)) %>%
select(id, word, count) %>%
group_by(id) %>%
summarise(all = sum(count))
negative_word <- data %>%
mutate(id = group_indices(data,artUrl)) %>%
select(id, word, count) %>%
left_join(negative,) %>%
na.omit() %>%
group_by(id) %>%
summarise(negative_sum = sum(count))
Joining, by = "word"
Column `word` joining character vector and factor, coercing into character vector
merge <- left_join(all_word, negative_word)
Joining, by = "id"
merge %>%
na.omit() %>%
mutate(ratio = negative_sum / all)
Ch3
3.1 Calculate Term Frequency
zipf_ch <- data %>%
mutate(id = group_indices(data,artUrl)) %>%
group_by(id) %>%
mutate(total = sum(count)) %>%
select(id, word, count, total)
ggplot(zipf_ch, aes(count/total))+
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.1)

3.2 Zipf’s Law
freq_by_rank <- data %>%
mutate(id = group_indices(data,artUrl)) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
arrange(desc(count)) %>%
mutate(total = sum(count)) %>%
mutate(rank = row_number()) %>%
mutate("term frequency" = count / total)
freq_by_rank %>%
ggplot(aes(rank, `term frequency`)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()

儘管我們用中文的資料集,跑出來的結果也跟教科書的很接近 => 符合Zipf’s Law 
rank_subset <- freq_by_rank %>%
filter(rank < 500,
rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
Call:
lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)
Coefficients:
(Intercept) log10(rank)
-1.4822 -0.7466
係數不太接近-1 => 可能是因為資料量不夠大(?)
3.3 The bind_tf_idf function
data_tfidf <- data %>%
bind_tf_idf(word, artUrl, count)
data_tfidf %>%
group_by(artUrl) %>%
mutate(total = sum(count)) %>%
select(-total) %>%
arrange(desc(tf_idf)) %>%
ungroup()%>%
select(artTitle, word, tf_idf)
文章的重要詞彙其實蠻合理的(但是artTitle要考慮一下)
隨機取一篇評論來看看他擁有高tf-idf的字
data_tfidf %>%
filter(artUrl == "https://www.ptt.cc/bbs/Gossiping/M.1549710934.A.821.html") %>%
top_n(10,wt = tf_idf) %>%
ggplot(aes(word, tf_idf), ) +
ggtitle("[問卦]華航有可能跟日航一樣破產嗎?")+
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
theme(text=element_text(family="黑體-繁 中黑", size=14))+
coord_flip()
Warning message:
In strsplit(code, "\n", fixed = TRUE) :
input string 1 is invalid in this locale

3.4是分析不同書本,省略不做
CH4
CH5
#install.packages('tmcn')
library(dplyr)
library(data.table)
library(ggplot2)
library(tmcn)
library(tm)
library(tidytext)
social_media=fread('../data/social_media_artWord.csv',encoding='UTF-8',stringsAsFactors = T)
social_media_artWordFreq=fread('../data/social_media_artWordFreq.csv',encoding='UTF-8',stringsAsFactors = T)
5.1 Tidying a document-term matrix
dtm_data=social_media_artWordFreq %>%
cast_dtm(artTitle, word, count)
dfm_data=social_media_artWordFreq %>%
cast_dfm(.,artTitle,word,count)
5.1.1 Tidying DocumentTermMatrix objects
library(tm)
dfm_data= tidy(dfm_data)
dtm_data_tfidf=dfm_data %>% bind_tf_idf(term, document, count)
dtm_data_tfidf
library(ggplot2)
dtm_data_tfidf %>%
mutate(term = reorder(term, count)) %>%
filter(count > 25) %>%
ggplot(aes(term, count,fill=term)) +
geom_col(show.legend = FALSE) +
xlab(NULL) +
coord_flip()

5.2 Casting tidy text data into a matrix
dtm_data=social_media_artWordFreq %>%
cast_dtm(artTitle, word, count)
dfm_data=social_media_artWordFreq %>%
cast_dfm(.,artTitle,word,count)
library(Matrix)
# cast into a Matrix object
m <- social_media_artWordFreq %>%
cast_sparse(artTitle,word,count)
class(m)
[1] "dgCMatrix"
attr(,"package")
[1] "Matrix"
dim(m)
[1] 1653 22437
CH6
#install.packages('topicmodels')
library(dplyr)
library(data.table)
library(ggplot2)
library(topicmodels)
social_media=fread('../data/social_media_artWord.csv',encoding='UTF-8',stringsAsFactors = T)
social_media_artWordFreq=fread('../data/social_media_artWordFreq.csv',encoding='UTF-8',stringsAsFactors = T)
6.1 Latent Dirichlet allocation
dtm_data=social_media_artWordFreq %>%
cast_dtm(artTitle, word, count)
dtm_data
<<DocumentTermMatrix (documents: 1653, terms: 22437)>>
Non-/sparse entries: 117527/36970834
Sparsity : 100%
Maximal term length: 73
Weighting : term frequency (tf)
social_lda <- LDA(dtm_data, k = 4, control = list(seed = 1234))
social_lda
A LDA_VEM topic model with 4 topics.
6.1.1 Word-topic probabilities
social_topics <- tidy(social_lda, matrix = "beta")
social_topics
top_terms <- social_topics %>%
group_by(topic) %>%
top_n(5, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms
library(ggplot2)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()

library(tidyr)
beta_spread <- social_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread
6.1.2 Document-topic probabilities
ap_documents <- tidy(social_lda, matrix = "gamma")
ap_documents[,2:3]
---
title: "Chinese_tutorial_ch23"
output: html_notebook
author: "MingLun Wu, Yuming Liu"
---

# Ch0 About data
** 資料是從文字分析平台下載而來，使用2/1~2/3華航罷工之PTT文章，關鍵字為:華航、罷工、機師，共有2200多篇文章**
```{r}
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
require(data.table)
require(readr)
require(dplyr)
require(stringr)
require(jsonlite)
require(tidyr)

require(tidytext)

require(wordcloud)
require(reshape2)

require(ggraph) #extends ggplot2 to construct network graph.
require(widyr) #calculate pairwise correlation.

require(igraph)
require(topicmodels)
```



# 2.1 ~ 2.3 直接透過平台的產出(文章+情緒)
```{r}
csv <- fread("../data/social_media_artSen.csv",encoding = "UTF-8")
```
```{r}
data <- csv %>% 
  mutate(sentiment = positive_emotion_grade - negative_emotion_grade) %>%
  mutate(doc_id = row_number()) %>%
  select(doc_id, sentiment)

data[1:150,] %>% 
ggplot(aes(doc_id, sentiment)) +
  geom_col(show.legend = FALSE)
```
### 2.4 最常出現的正面與負面情緒
```{r}
csv <- fread("../data/social_media_artWordFreq.csv", encoding = "UTF-8")
```

##### 中文情緒辭典
```{r}
positive_file <- "../data/positive.txt"
positive_line <- read_lines(positive_file, n_max = 200000, progress = FALSE)

positive <- str_split(positive_line, ",")

negative_file <- "../data/negative.txt"
negative_line <- read_lines(negative_file, n_max = 200000, progress = FALSE)

negative <- str_split(negative_line, ",")

positive <- data.frame(word = positive, sentiments = "positive")
colnames(positive) = c("word","sentiment")

negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")

LIWC_ch <- rbind(positive, negative)
```

### 對應中文情緒辭典
```{r}
data <- csv
word_counts <- data %>%
  select(word,count) %>% 
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  inner_join(LIWC_ch)
word_counts
```

```{r}
word_counts %>%
  group_by(sentiment) %>%
  top_n(10,wt = count) %>%
  ungroup() %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(family="黑體-繁 中黑", size=14))+
  coord_flip()
```

```{r}
par(family=("Heiti TC Light")) #避免字型發生錯誤
word_counts %>%
  with(wordcloud(word, count, max.words = 100))
```

```{r}
par(family=("Heiti TC Light")) #避免字型發生錯誤
word_counts %>%
  acast(word ~ sentiment, value.var = "count", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)
```
### 2.7 計算負面情緒佔該篇評論的比率 (原本是佔該章節)
```{r}
all_word <- data %>%
  mutate(id = group_indices(data,artUrl)) %>%
  select(id, word, count) %>%
  group_by(id) %>%
  summarise(all = sum(count))

negative_word <- data %>%
  mutate(id = group_indices(data,artUrl)) %>%
  select(id, word, count) %>%
  left_join(negative,) %>%
  na.omit() %>%
  group_by(id) %>%
  summarise(negative_sum = sum(count))

merge <- left_join(all_word, negative_word) 
merge %>%
  na.omit() %>%
  mutate(ratio = negative_sum / all)
```

#### Ch3 
### 3.1 Calculate Term Frequency
```{r}
zipf_ch <- data %>% 
  mutate(id = group_indices(data,artUrl)) %>%
  group_by(id) %>%
  mutate(total = sum(count)) %>%
  select(id, word, count, total) 

ggplot(zipf_ch, aes(count/total))+
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.1)
```

### 3.2 Zipf's Law
```{r}
freq_by_rank <- data %>% 
  mutate(id = group_indices(data,artUrl)) %>% 
  group_by(word) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  mutate(total = sum(count)) %>%
  mutate(rank = row_number()) %>%
  mutate("term frequency" = count / total)
  
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()
```

**儘管我們用中文的資料集，跑出來的結果也跟教科書的很接近 => 符合Zipf's Law**
<img src="https://www.tidytextmining.com/03-tf-idf_files/figure-html/zipf-1.png">

```{r}
rank_subset <- freq_by_rank %>% 
  filter(rank < 500,
         rank > 10)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
```
**係數不太接近-1 => 可能是因為資料量不夠大(?)**

### 3.3 The bind_tf_idf function
```{r}
data_tfidf <- data %>%
  bind_tf_idf(word, artUrl, count)
```

```{r}
data_tfidf %>% 
  group_by(artUrl) %>%
  mutate(total = sum(count)) %>%
  select(-total) %>% 
  arrange(desc(tf_idf)) %>%
  ungroup()%>%
  select(artTitle, word, tf_idf)
```

**文章的重要詞彙其實蠻合理的(但是artTitle要考慮一下)**

### 隨機取一篇評論來看看他擁有高tf-idf的字
```{r}
data_tfidf %>% 
  filter(artUrl == "https://www.ptt.cc/bbs/Gossiping/M.1549710934.A.821.html") %>%
  top_n(10,wt = tf_idf) %>%
  ggplot(aes(word, tf_idf), ) +
  ggtitle("[問卦]華航有可能跟日航一樣破產嗎?")+
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  theme(text=element_text(family="黑體-繁 中黑", size=14))+
  coord_flip()
```

### 3.4是分析不同書本，省略不做



####CH4










####CH5







```{r}
#install.packages('tmcn')
library(dplyr)
library(data.table)
library(ggplot2)
library(tmcn)
library(tm)
library(tidytext)
```


```{r}

social_media=fread('../data/social_media_artWord.csv',encoding='UTF-8',stringsAsFactors = T)
social_media_artWordFreq=fread('../data/social_media_artWordFreq.csv',encoding='UTF-8',stringsAsFactors = T)

```





###5.1 Tidying a document-term matrix





```{r}
dtm_data=social_media_artWordFreq %>%
  cast_dtm(artTitle, word, count)
```



```{r}
dfm_data=social_media_artWordFreq %>%
  cast_dfm(.,artTitle,word,count)
```



####5.1.1 Tidying DocumentTermMatrix objects

```{r}
library(tm)
dfm_data= tidy(dfm_data)
dtm_data_tfidf=dfm_data %>% bind_tf_idf(term, document, count) 
dtm_data_tfidf 
```


```{r}
library(ggplot2)

dtm_data_tfidf %>%

mutate(term = reorder(term, count)) %>%
filter(count > 25) %>%
ggplot(aes(term, count,fill=term)) +
geom_col(show.legend = FALSE) +
xlab(NULL) +
coord_flip() 

```



###5.2 Casting tidy text data into a matrix




```{r}
dtm_data=social_media_artWordFreq %>%
  cast_dtm(artTitle, word, count)
```



```{r}
dfm_data=social_media_artWordFreq %>%
  cast_dfm(.,artTitle,word,count)
```


```{r}
library(Matrix)

# cast into a Matrix object
m <- social_media_artWordFreq %>%
  cast_sparse(artTitle,word,count)

class(m)
```

```{r}
dim(m)
```




####CH6


```{r}
#install.packages('topicmodels')
library(dplyr)
library(data.table)
library(ggplot2)
library(topicmodels)
```




```{r}

social_media=fread('../data/social_media_artWord.csv',encoding='UTF-8',stringsAsFactors = T)
social_media_artWordFreq=fread('../data/social_media_artWordFreq.csv',encoding='UTF-8',stringsAsFactors = T)

```



###6.1  Latent Dirichlet allocation







```{r}
dtm_data=social_media_artWordFreq %>%
  cast_dtm(artTitle, word, count)
dtm_data
```





```{r}
social_lda <- LDA(dtm_data, k = 4, control = list(seed = 1234))
social_lda

```

####6.1.1 Word-topic probabilities

```{r}
social_topics <- tidy(social_lda, matrix = "beta")
social_topics 
```




```{r}

top_terms <- social_topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms
```


```{r}
library(ggplot2)

top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
```




```{r}
library(tidyr)

beta_spread <- social_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_spread
```



###6.1.2 Document-topic probabilities


```{r}
ap_documents <- tidy(social_lda, matrix = "gamma")
ap_documents[,2:3]
```








<style>

em {
    color: #FFEA6C;
    background: #7D7D7D;
}

.caption {
  color: #777;
  margin-top: 10px;
}
p code {
  white-space: inherit;
}
pre {
  word-break: normal;
  word-wrap: normal;
  line-height: 1;
}
pre code {
  white-space: inherit;
}
p,li {
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

.r{
  line-height: 1.2;
}

.qiz {
  line-height: 1.75;
  background: #f0f0f0;
  border-left: 12px solid #ccffcc;
  padding: 4px;
  padding-left: 10px;
  color: #009900;
}

title{
  color: #cc0000;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

body{
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h1,h2,h3,h4,h5{
  color: #0066ff;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}


h3{
  color: #b36b00;
  background: #ffe0b3;
  line-height: 2;
  font-weight: bold;
}

h5{
  color: #006000;
  background: #f8f8f8;
  line-height: 1.5;
  font-weight: bold;
}

h6 {
    color: #006000;
    background: #00ffff;
    line-height: 2;
    font-weight: bold;
}

</style>



