Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
[1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales','rvest','magrittr','RCurl','XML','slam')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
library(magrittr)
library(ggplot2)
library(wordcloud)
library(scales)
library(readr)
library(jiebaR)
P <- read_csv("/Volumes/GoogleDrive/Team\ Drives/THESIS/案例構思/巨力搬家/Data\ Collection/爬蟲資料/Google/GoogleMapReview.csv")
Parsed with column specification:
cols(
city = [31mcol_character()[39m,
artTitle = [31mcol_character()[39m,
rating = [32mcol_double()[39m,
user_ratings_total = [32mcol_double()[39m,
artContent = [31mcol_character()[39m
)
colnames(P)
[1] "city" "artTitle" "rating" "user_ratings_total"
[5] "artContent"
# [1] "city" "artTitle" "rating" "user_ratings_total"
# [5] "artContent" "text"
P$text = P$artContent
library(jiebaR)
userwd = c("搬家公司","崔媽媽","康福","巨力","大吉旺","搬家","乙久利","康福搬家","巨力搬家","崔媽媽基金會","不加價","不亂加價","崔媽媽網站","舊傢俱","農民曆","風水","時辰","傢俱","傢具")
cc = worker()
new_user_word(cc,userwd,tags=rep('n',length(userwd)))
[1] TRUE
stopwd = c(readLines("/Volumes/GoogleDrive/Team\ Drives/THESIS/案例構思/巨力搬家/Data\ Collection/爬蟲資料/dict/stop_words.txt",encoding="UTF-8"),
"真的","com","XD","imgur","jpg","www","http","i",
"html","bbs","請問","https","po","ptt","cc","Contacts",
"文章","謝謝","之前","網址","想說","知道","問題",
"一下","現在","後","再","人","喔","分享","有點","最近",
"推薦","好像","應該","太","放","已經","看看",
"看到"," 一直","發現","推","這款","耶","今天","之後",
"超","一點","小","原","有沒有","建議","一個","找",
"戴","買","好","比較","覺得","說","想","真的","可能",
"就是","因為","所以","如果","也是","一個","真的","後來",
"時","完","幫","先","搬","請","一車","參考","一次","一台",
"一些","一家","一趟","兩次","一定","這是","這家","至少",
"寫","所有","搬過","本來","是否","前","噸","一直","約","元",
"月","台北","台南","這次","中","台","算","吋","次","公",
"最","家","高","當時","免","問","囉","無","找過","記得",
"辦法","順利","部分","那種","感謝","幾次","送","整個",
"新竹","有人","只能","只","做","不知","OK","台中","我家",
"不到","還要","開","走","點","府","估","真是","這種",
"不用","直接","還會","公司","錢","車","原本","載","舊",
"當初","當天","最後","下","已","一人","一間","事情","地方",
"台南","搬運","搬到","需要","無法","改","以上","兩個","X",
"最好","要求","一起","老闆","幾個","放在","給我","去年",
"感覺","事","卻","找到","以後","希望","重點","給你","還好",
"遇到","通常","這間","上次","家裡","回頭","剩下","桃園",
"縣市","高雄","完全","阿姨","選擇","像是","不錯","方面","裡面",
"第五次","主要","住","歲","花","B","算是","收","就要","兩三個",
"能夠","再來","相關","不想","男朋友","寄","男","感到","認為","崁",
"年","一天","用到","過去","想要","今年","很大","這裡","繼續","箱",
"之外","求","整體而言","進行","原先","一個月","南","左右","回去",
"我要","板橋","説","第一次","家樂福","挑","一塊","最大","搬出去",
"上面","透過","巨力搬家","巨力","原因","來說","打開","有太多","過來",
"妹妹","智慧","box","輸入","傳統","中繼站","信義","子","不收",
"有時候","是從","小東西","NA","一位","and","the","十分","they",
"謝謝","具","包","墊","掉","好好","真","were","樓","超過")
於 '/Volumes/GoogleDrive/Team Drives/THESIS/案例構思/巨力搬家/Data Collection/爬蟲資料/dict/stop_words.txt' 找到不完整的最後一列
library(slam); library(tm); library(tmcn); library(rJava); library(SnowballC); library(jiebaR)
ii = jj = vv =NULL
for(i in 1:length(P$text)) {
x = segment(P$text[i], cc)
x = x[! grepl("[0-9]+",x)]
x = table(x[! x %in% stopwd])
ii = c(ii, rep(i,length(x)))
jj = c(jj, names(x))
vv = c(vv, as.vector(x))
}
jj = as.factor(jj)
l = levels(jj)
dtm = simple_triplet_matrix(ii,as.integer(jj),vv,
dimnames=list(as.character(1:length(P$text)),l))
Error in simple_triplet_matrix(ii, as.integer(jj), vv, dimnames = list(as.character(1:length(P$text)), :
'dimnames' component invalid length or type
library(MASS)
library(wordcloud)
colors = c('gold','purple','cyan','red','orange3','gray',
'pink','green','darkgreen','blue','brown','magenta','deepskyblue3')
n = 200; a = 5
bd = dist(t(dtm[,a:n]),method="binary")
# bd = dist(t(dtm[,a:n]))
mds = isoMDS(bd + 10^(-6))
initial value 72.145980
iter 5 value 40.909806
iter 10 value 38.938357
iter 15 value 38.074441
final value 37.818707
converged
hc = hclust(bd,method="ward.D2")
plot(hc, family = "Heiti TC Light")
k=12
plot(hc,cex=0.7,main="Hierarchical Clusters (Binary Distance)",
sub="",ylab="",xlab="Jaccard (Binary) Distance",
cex.axis=0.6,cex.lab=0.8,cex.main=1, family = "Heiti TC Light")
rect.hclust(hc, k=k, border="pink")
gp = cutree(hc, k=k); table(gp)
gp
1 2 3 4 5 6 7 8 9 10 11 12
27 53 10 34 9 8 9 9 7 13 15 2
wc = col_sums(dtm[,a:n])
p0 = par(mar=c(1,1,1,1))
textplot(mds$points[,1],mds$points[,2],colnames(dtm)[a:n],
show=F, col=colors[gp],cex=1.5*sqrt(wc/mean(wc)),
font=2,axes=F, family = "Microsoft JhengHei"); par(p0)
### Sentiment Analysis
# install.packages('chinese.misc')
library(chinese.misc)
library(tm)
library(jiebaR)
library(Matrix)
library(readr)
library(tidytext)
library(tidyr)
P$artContent = as.character(P$artContent)
P$text = as.character(P$text)
jieba_tokenizer <- worker(user="dict/MH_words.dict")
##### Segmentation
# Setup segment function
MH_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
tokens = P %>%
unnest_tokens(word, artContent, token = MH_tokenizer)
tokens = tokens %>%
mutate(count = n()) %>%
ungroup() %>%
distinct() # 己經統計過數量;去除重複資料
tokens
p <- read_file("/Volumes/GoogleDrive/Team\ Drives/THESIS/案例構思/巨力搬家/Data\ Collection/爬蟲資料/dict/MH_positive.txt")
n <- read_file("/Volumes/GoogleDrive/Team\ Drives/THESIS/案例構思/巨力搬家/Data\ Collection/爬蟲資料/dict/MH_negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative) %>% distinct()
LIWC_ch
senti_tokens = tokens %>% inner_join(LIWC_ch)
Joining, by = "word"
Column `word` joining character vector and factor, coercing into character vector
head(senti_tokens)
tokens %>%
inner_join(LIWC_ch) %>%
group_by(sentiment) %>%
count(word) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size = 6)) +
coord_flip() +
theme(text = element_text(family = "Heiti TC Light"))
Joining, by = "word"
Column `word` joining character vector and factor, coercing into character vector