Encoding to UTF-8
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
[1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"



Install essential package
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales','rvest','magrittr','RCurl','XML','slam')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)



Loading library
library(dplyr)
library(magrittr)
library(ggplot2)
library(wordcloud)
library(scales)
library(readr)
library(jiebaR)



Loading data (mobile01)
P <- read_csv("/Volumes/GoogleDrive/Team\ Drives/THESIS/案例構思/巨力搬家/Data\ Collection/爬蟲資料/interview/MHinterview.csv")
Parsed with column specification:
cols(
  artTitle = col_character(),
  artDate = col_date(format = ""),
  artTime = col_time(format = ""),
  artUrl = col_character(),
  artContent = col_character()
)
P$text = P$artContent
Loading JiebaR package and setup stop words dictionary
library(jiebaR)
userwd = c("搬家公司","崔媽媽","康福","巨力","大吉旺","搬家","乙久利","康福搬家","巨力搬家","崔媽媽基金會","不加價","不亂加價","崔媽媽網站","舊傢俱","農民曆","風水","時辰")
cc = worker()
new_user_word(cc,userwd,tags=rep('n',length(userwd)))
[1] TRUE
stopwd = c(readLines("/Volumes/GoogleDrive/Team\ Drives/THESIS/案例構思/巨力搬家/Data\ Collection/爬蟲資料/dict/stop_words.txt",encoding="UTF-8"),
           "真的","com","XD","imgur","jpg","www","http","i",
           "html","bbs","請問","https","po","ptt","cc","Contacts",
           "文章","謝謝","之前","網址","想說","知道","問題",
           "一下","現在","後","再","人","喔","分享","有點","最近",
           "推薦","好像","應該","太","放","已經","看看",
           "看到","  一直","發現","推","這款","耶","今天","之後",
           "超","一點","小","原","有沒有","建議","一個","找",
           "戴","買","好","比較","覺得","說","想","真的","可能",
           "就是","因為","所以","如果","也是","一個","真的","後來",
           "時","完","幫","先","搬","請","一車","參考","一次","一台",
           "一些","一家","一趟","兩次","一定","這是","這家","至少",
           "寫","所有","搬過","本來","是否","前","噸","一直","約","元",
           "月","台北","台南","這次","中","台","算","吋","次","公",
           "最","家","高","當時","免","問","囉","無","找過","記得",
           "辦法","順利","部分","那種","感謝","幾次","送","整個",
           "新竹","有人","只能","只","做","不知","OK","台中","我家",
           "不到","還要","開","走","點","府","估","真是","這種",
           "不用","直接","還會","公司","錢","車","原本","載","舊",
           "當初","當天","最後","下","已","一人","一間","事情","地方",
           "台南","搬運","搬到","需要","無法","改","以上","兩個","X",
           "最好","要求","一起","老闆","幾個","放在","給我","去年",
           "感覺","事","卻","找到","以後","希望","重點","給你","還好",
           "遇到","通常","這間","上次","家裡","回頭","剩下","桃園",
           "縣市","高雄","完全","阿姨","選擇","像是","不錯","方面","裡面",
           "第五次","主要","住","歲","花","B","算是","收","就要","兩三個",
           "能夠","再來","相關","不想","男朋友","寄","男","感到","認為","崁",
           "年","一天","用到","過去","想要","今年","很大","這裡","繼續","箱",
           "之外","求","整體而言","進行","原先","一個月","南","左右","回去",
           "我要","板橋","説","第一次","家樂福","挑","一塊","最大","搬出去",
           "上面","透過","巨力搬家","巨力","原因","來說","打開","有太多","過來",
           "妹妹","智慧","box","輸入","傳統","中繼站","信義","子","不收",
           "有時候","是從","小東西")
於 '/Volumes/GoogleDrive/Team Drives/THESIS/案例構思/巨力搬家/Data Collection/爬蟲資料/dict/stop_words.txt' 找到不完整的最後一列



Descriptive Statistics



Select artDate and change format
P$artDate= P$artDate %>% as.Date("%Y/%m/%d")
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
data = P %>% dplyr::select(artDate)



Grouping date
data <- data %>% 
  group_by(artDate) %>% 
  summarise(count = n())



Line graph of articles
plot_date <- data %>% ggplot(aes(x = artDate, y = count)) +
  geom_line(color = "#00AFBB", size = 1) + 
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("搬家-討論文章數(Mobile01)") + 
  xlab("Date") + 
  ylab("Count") + 
  theme(text = element_text(family = "Heiti TC Light")) 

plot_date



Barplot (num of words)
library(slam); library(tm); library(tmcn); library(rJava); library(SnowballC); library(jiebaR)

ii = jj = vv =NULL
for(i in 1:length(P$text)) {
  x = segment(P$text[i], cc)
  x = x[! grepl("[0-9]+",x)]
  x = table(x[! x %in% stopwd])
  ii = c(ii, rep(i,length(x)))
  jj = c(jj, names(x))
  vv = c(vv, as.vector(x))
}
jj = as.factor(jj)

l = levels(jj)
dtm = simple_triplet_matrix(ii,as.integer(jj),vv,
             dimnames=list(as.character(1:length(P$text)),l))
dim(dtm)
[1]    9 1265
save(dtm, file="MHmb01.rdata")
dtm = dtm[, order(col_sums(dtm), decreasing=T)]
col_sums(dtm)[1:40] %>% barplot(las=2, family = "Heiti TC Light")



Hierarchical Clusters (binary distance)
library(MASS)
library(wordcloud)
colors = c('gold','purple','cyan','red','orange3','gray',
           'pink','green','darkgreen','blue','brown','magenta','deepskyblue3')

n = 200; a = 5
bd = dist(t(dtm[,a:n]),method="binary")
# bd = dist(t(dtm[,a:n]))
mds =  isoMDS(bd + 10^(-6))
initial  value 35.255153 
final  value 35.255153 
converged
hc = hclust(bd,method="ward.D2")
plot(hc, family = "Heiti TC Light") 


k=12
plot(hc,cex=0.7,main="Hierarchical Clusters (Binary Distance)",
     sub="",ylab="",xlab="Jaccard (Binary) Distance",
     cex.axis=0.6,cex.lab=0.8,cex.main=1, family = "Heiti TC Light")
rect.hclust(hc, k=k, border="pink")



Word Cloud
gp = cutree(hc, k=k); table(gp)
gp
 1  2  3  4  5  6  7  8  9 10 11 12 
24 12 15 17 24 18 19 21 14 12 10 10 
wc = col_sums(dtm[,a:n])
p0 = par(mar=c(1,1,1,1))
textplot(mds$points[,1],mds$points[,2],colnames(dtm)[a:n],
         show=F, col=colors[gp],cex=1.5*sqrt(wc/mean(wc)),
         font=2,axes=F, family = "Microsoft JhengHei"); par(p0)



### Sentiment Analysis

Loading Chinese Texting needed packages
# install.packages('chinese.misc')
library(chinese.misc)
library(tm)
library(jiebaR)
library(Matrix)
library(readr)
library(tidytext)
library(tidyr)



Change the format of the dataset
P$artContent = as.character(P$artContent)
P$text = as.character(P$text)
P$artDate = as.Date(P$artDate)



Loading customize dict in Jieba
jieba_tokenizer <- worker(user="dict/MH_words.dict")



##### Segmentation

# Setup segment function
MH_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

tokens = P %>% 
  unnest_tokens(word, artContent, token = MH_tokenizer)



Count for the amount of words in corpus

tokens = tokens %>%
  dplyr::select(artDate, word) %>%
  group_by(artDate, word) %>%
  mutate(count = n()) %>%
  ungroup() %>%
  distinct() # 己經統計過數量;去除重複資料

tokens



LIWC dict (positive, negative)
p <- read_file("/Volumes/GoogleDrive/Team\ Drives/THESIS/案例構思/巨力搬家/Data\ Collection/爬蟲資料/dict/MH_positive.txt")
n <- read_file("/Volumes/GoogleDrive/Team\ Drives/THESIS/案例構思/巨力搬家/Data\ Collection/爬蟲資料/dict/MH_negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative) %>% distinct()

LIWC_ch



Join corpus with LIWC (ot make sure the word is belongs to positive or negative)
senti_tokens = tokens %>% inner_join(LIWC_ch) 
Joining, by = "word"
Column `word` joining character vector and factor, coercing into character vector
head(senti_tokens) 



Find out all date in time period
all_dates <- 
  expand.grid(seq(as.Date(min(data$artDate)), as.Date(max(data$artDate)), by="day"), c("positive", "negative"))
head(all_dates)



Line chart of sentiment (positive, negative)

senti_tokens$artDate = as.Date(senti_tokens$artDate)
# head(senti_tokens)        artTitle, artDate, artTime, artUrl, text, word, sentiment
# head(LIWC_ch)             word, sentiment
plot_table <- senti_tokens %>%
  inner_join(LIWC_ch) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count = n())
Joining, by = c("word", "sentiment")
Column `word` joining character vector and factor, coercing into character vector
plot_table %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  xlab("Date") + ylab("Count")



Common sentiment words in LIWC dict (positive, negative)
tokens %>% 
  inner_join(LIWC_ch) %>% 
  group_by(sentiment) %>%
  count(word) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=11)) +
  coord_flip() + 
  theme(text = element_text(family = "Heiti TC Light")) 
Joining, by = "word"
Column `word` joining character vector and factor, coercing into character vector



---
title: "台灣搬家產業文字分析（深度訪談）/ Text Mining of Moving Industry in Taiwan (Interview)"
author: "Karen Yang 楊凱倫"
output: html_notebook
---

<br>
<br>

+ Corpus Type: Interview (someone who has experience in moving home)
+ Time Period: 2019/04/27 - 2019/06/02
+ Number of corpus: 9

<br>
<br>

##### Encoding to UTF-8
```{r}
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
```

<br>
<br>

##### Install essential package
```{r}
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales','rvest','magrittr','RCurl','XML','slam')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
```

<br>
<br>

##### Loading library
```{r}
library(dplyr)
library(magrittr)
library(ggplot2)
library(wordcloud)
library(scales)
library(readr)
library(jiebaR)
```

<br>
<br>

##### Loading data (mobile01)
```{r}
P <- read_csv("/Volumes/GoogleDrive/Team\ Drives/THESIS/案例構思/巨力搬家/Data\ Collection/爬蟲資料/interview/MHinterview.csv")
P$text = P$artContent
```



##### Loading JiebaR package and setup stop words dictionary
```{r}
library(jiebaR)
userwd = c("搬家公司","崔媽媽","康福","巨力","大吉旺","搬家","乙久利","康福搬家","巨力搬家","崔媽媽基金會","不加價","不亂加價","崔媽媽網站","舊傢俱","農民曆","風水","時辰")
cc = worker()
new_user_word(cc,userwd,tags=rep('n',length(userwd)))

stopwd = c(readLines("/Volumes/GoogleDrive/Team\ Drives/THESIS/案例構思/巨力搬家/Data\ Collection/爬蟲資料/dict/stop_words.txt",encoding="UTF-8"),
           "真的","com","XD","imgur","jpg","www","http","i",
           "html","bbs","請問","https","po","ptt","cc","Contacts",
           "文章","謝謝","之前","網址","想說","知道","問題",
           "一下","現在","後","再","人","喔","分享","有點","最近",
           "推薦","好像","應該","太","放","已經","看看",
           "看到","  一直","發現","推","這款","耶","今天","之後",
           "超","一點","小","原","有沒有","建議","一個","找",
           "戴","買","好","比較","覺得","說","想","真的","可能",
           "就是","因為","所以","如果","也是","一個","真的","後來",
           "時","完","幫","先","搬","請","一車","參考","一次","一台",
           "一些","一家","一趟","兩次","一定","這是","這家","至少",
           "寫","所有","搬過","本來","是否","前","噸","一直","約","元",
           "月","台北","台南","這次","中","台","算","吋","次","公",
           "最","家","高","當時","免","問","囉","無","找過","記得",
           "辦法","順利","部分","那種","感謝","幾次","送","整個",
           "新竹","有人","只能","只","做","不知","OK","台中","我家",
           "不到","還要","開","走","點","府","估","真是","這種",
           "不用","直接","還會","公司","錢","車","原本","載","舊",
           "當初","當天","最後","下","已","一人","一間","事情","地方",
           "台南","搬運","搬到","需要","無法","改","以上","兩個","Ｘ",
           "最好","要求","一起","老闆","幾個","放在","給我","去年",
           "感覺","事","卻","找到","以後","希望","重點","給你","還好",
           "遇到","通常","這間","上次","家裡","回頭","剩下","桃園",
           "縣市","高雄","完全","阿姨","選擇","像是","不錯","方面","裡面",
           "第五次","主要","住","歲","花","B","算是","收","就要","兩三個",
           "能夠","再來","相關","不想","男朋友","寄","男","感到","認為","崁",
           "年","一天","用到","過去","想要","今年","很大","這裡","繼續","箱",
           "之外","求","整體而言","進行","原先","一個月","南","左右","回去",
           "我要","板橋","説","第一次","家樂福","挑","一塊","最大","搬出去",
           "上面","透過","巨力搬家","巨力","原因","來說","打開","有太多","過來",
           "妹妹","智慧","box","輸入","傳統","中繼站","信義","子","不收",
           "有時候","是從","小東西")
```


<br>
<br>


### Descriptive Statistics

<br>
<br>

##### Select artDate and change format
```{r}
P$artDate= P$artDate %>% as.Date("%Y/%m/%d")
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
data = P %>% dplyr::select(artDate)
```

<br>
<br>

##### Grouping date
```{r}
data <- data %>% 
  group_by(artDate) %>% 
  summarise(count = n())
```

<br>
<br>

##### Line graph of articles
```{r}
plot_date <- data %>% ggplot(aes(x = artDate, y = count)) +
  geom_line(color = "#00AFBB", size = 1) + 
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("搬家－討論文章數（Mobile01）") + 
  xlab("Date") + 
  ylab("Count") + 
  theme(text = element_text(family = "Heiti TC Light")) 

plot_date
```

<br>
<br>

##### Barplot (num of words)
```{r}
library(slam); library(tm); library(tmcn); library(rJava); library(SnowballC); library(jiebaR)

ii = jj = vv =NULL
for(i in 1:length(P$text)) {
  x = segment(P$text[i], cc)
  x = x[! grepl("[0-9]+",x)]
  x = table(x[! x %in% stopwd])
  ii = c(ii, rep(i,length(x)))
  jj = c(jj, names(x))
  vv = c(vv, as.vector(x))
}
jj = as.factor(jj)

l = levels(jj)
dtm = simple_triplet_matrix(ii,as.integer(jj),vv,
             dimnames=list(as.character(1:length(P$text)),l))
dim(dtm)
save(dtm, file="MHmb01.rdata")
dtm = dtm[, order(col_sums(dtm), decreasing=T)]
col_sums(dtm)[1:40] %>% barplot(las=2, family = "Heiti TC Light")
```

<br>
<br>

##### Hierarchical Clusters (binary distance)
```{r}
library(MASS)
library(wordcloud)
colors = c('gold','purple','cyan','red','orange3','gray',
           'pink','green','darkgreen','blue','brown','magenta','deepskyblue3')

n = 200; a = 5
bd = dist(t(dtm[,a:n]),method="binary")
# bd = dist(t(dtm[,a:n]))
mds =  isoMDS(bd + 10^(-6))
hc = hclust(bd,method="ward.D2")
plot(hc, family = "Heiti TC Light") 

k=12
plot(hc,cex=0.7,main="Hierarchical Clusters (Binary Distance)",
     sub="",ylab="",xlab="Jaccard (Binary) Distance",
     cex.axis=0.6,cex.lab=0.8,cex.main=1, family = "Heiti TC Light")
rect.hclust(hc, k=k, border="pink")
```

<br>
<br>

##### Word Cloud
```{r}
gp = cutree(hc, k=k); table(gp)
wc = col_sums(dtm[,a:n])
p0 = par(mar=c(1,1,1,1))
textplot(mds$points[,1],mds$points[,2],colnames(dtm)[a:n],
         show=F, col=colors[gp],cex=1.5*sqrt(wc/mean(wc)),
         font=2,axes=F, family = "Microsoft JhengHei"); par(p0)
```

<br>
<br>
### Sentiment Analysis

##### Loading Chinese Texting needed packages
```{r}
# install.packages('chinese.misc')
library(chinese.misc)
library(tm)
library(jiebaR)
library(Matrix)
library(readr)
library(tidytext)
library(tidyr)
```

<br>
<br>

##### Change the format of the dataset 
```{r}
P$artContent = as.character(P$artContent)
P$text = as.character(P$text)
P$artDate = as.Date(P$artDate)
```

<br>
<br>

##### Loading customize dict in Jieba
```{r}
jieba_tokenizer <- worker(user="dict/MH_words.dict")
```

<br>
<br>
##### Segmentation
```{r}
# Setup segment function
MH_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

tokens = P %>% 
  unnest_tokens(word, artContent, token = MH_tokenizer)
```

<br>
<br>

# Count for the amount of words in corpus
```{r}
tokens = tokens %>%
  dplyr::select(artDate, word) %>%
  group_by(artDate, word) %>%
  mutate(count = n()) %>%
  ungroup() %>%
  distinct() # 己經統計過數量；去除重複資料

tokens
```

<br>
<br>

##### LIWC dict (positive, negative)
```{r}
p <- read_file("/Volumes/GoogleDrive/Team\ Drives/THESIS/案例構思/巨力搬家/Data\ Collection/爬蟲資料/dict/MH_positive.txt")
n <- read_file("/Volumes/GoogleDrive/Team\ Drives/THESIS/案例構思/巨力搬家/Data\ Collection/爬蟲資料/dict/MH_negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative) %>% distinct()

LIWC_ch
```

<br>
<br>

##### Join corpus with LIWC (ot make sure the word is belongs to positive or negative)
```{r}
senti_tokens = tokens %>% inner_join(LIWC_ch) 
head(senti_tokens) 
```

<br>
<br>

##### Find out all date in time period
```{r}
all_dates <- 
  expand.grid(seq(as.Date(min(data$artDate)), as.Date(max(data$artDate)), by="day"), c("positive", "negative"))
head(all_dates)
```

<br>
<br>

# Line chart of sentiment (positive, negative)
```{r}
senti_tokens$artDate = as.Date(senti_tokens$artDate)
# head(senti_tokens)        artTitle, artDate, artTime, artUrl, text, word, sentiment
# head(LIWC_ch)             word, sentiment
plot_table <- senti_tokens %>%
  inner_join(LIWC_ch) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count = n())

plot_table %>%
  ggplot() +
  geom_line(aes(x = artDate, y = count, colour = sentiment)) +
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  xlab("Date") + ylab("Count")
```

<br>
<br>

##### Common sentiment words in LIWC dict (positive, negative)

```{r}
tokens %>% 
  inner_join(LIWC_ch) %>% 
  group_by(sentiment) %>%
  count(word) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=11)) +
  coord_flip() + 
  theme(text = element_text(family = "Heiti TC Light")) 
```

<br>
<br>

<style>
.caption {
  margin-top: 10px;
}
p code {
  white-space: inherit;
}
pre {
  word-break: normal;
  word-wrap: normal;
  line-height: 1;
}
pre code {
  white-space: inherit;
}
p,li {
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

.r{
  line-height: 1.2;
}

body{
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h1,h2,h3,h4,h5{
  color: #444444;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h3{
  color: #468284;
  background: #ffe0b3;
  line-height: 2;
  font-weight: bold;
}

h5{
  color: #468284;
  background: #ffffe0;
  line-height: 2;
  font-weight: bold;
}

em{
  color: #468284;
  }

</style>