library(data.table)
library(dplyr)
library(stringr)
Amazon=fread('./data/Amazon_reviews.csv')
# pre-processing:
reviews=Amazon$artContent
#reviews=reviews[-2]
clean=function(review){
reviews <- as.list(t(review))
reviews <-str_trim(reviews)
reviews <- toupper(reviews)
reviews <- gsub("'", "", reviews) # remove apostrophes
reviews <- gsub("[[:punct:]]", " ", reviews) # replace punctuation with space
reviews <- gsub("[[:cntrl:]]", " ", reviews) # replace control characters with space
reviews <- gsub("[[:digit:]]"," ", reviews) # remove number
reviews <- gsub("^[[:space:]]+", "", reviews) # remove whitespace at beginning of documents
reviews <- gsub("[[:space:]]+$", "", reviews) # remove whitespace at end of documents
reviews <- tolower(reviews) # force to lowercase
reviews
}
Sentiment analysis
#install.packages('syuzhet')
library(syuzhet)
X=data.frame()
split(Amazon,Amazon$tool_name) %>% sapply(.,function(v){
txt=clean(v$artContent)
senti = get_nrc_sentiment(txt)
f = data.frame(t(colSums(senti)))
X <<- rbind(X,f)
rownames(X)[nrow(X)] <<- v$tool_name
})
Visualization
#devtools::install_github("jbkunst/highcharter")
library(highcharter)
Highcharts (www.highcharts.com) is a Highsoft software product which is
not free for commercial and Governmental use
library(dplyr)
pmap = function(dx,st2="") { # perceptual map
hchart(princomp(dx),cor=T) %>%
hc_chart(zoomType = "xy") %>%
hc_add_theme(hc_theme_flat()) %>%
hc_tooltip(headerFormat="",pointFormat="{series.name}") %>%
hc_title(text="Emotions on Brands of Hammers") %>%
hc_subtitle(text=paste0("93 brands ",st2))}
pmap(X[,1:8]/(rowSums(X[,1:8])+1))
#save(X,file='0929.Rdata')
尺度縮減字詞共現
建立字頻表
library(tm)
library(stringr)
dtm = Amazon$artContent %>%
iconv(to = "utf-8", sub="") %>%
str_trim(.) %>%
toupper() %>%
VectorSource %>% Corpus %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removePunctuation) %>%
tm_map(stemDocument) %>%
DocumentTermMatrix %>%
removeSparseTerms(0.998)
dtm # (documents: 14156, terms: 1030)
dtm_tmp=dtm
TF-IDF 過濾
library(slam)
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
log2(nrow(dtm)/col_sums(dtm > 0))
summary(tfidf)
dtm=dtm_tmp
dtm = dtm[, tfidf > 0.24 ]
dtm = dtm[,order(-col_sums(dtm))]
dim(dtm)
使用tSNE做尺度縮減
#install.packages('Rtsne')
library(Rtsne)
n = 636
tsne = dtm[, 1:n] %>% as.data.frame.matrix %>%
scale %>% t %>% Rtsne(
check_duplicates = FALSE, theta=0.0, max_iter=3200)
層級式分群
Y = tsne$Y # tSNE coordinates
d = dist(Y) # distance matrix
hc = hclust(d) # hi-clustering
K = 100 # number of clusters
g = cutree(hc,K) # cut into K clusters
table(g) %>% as.vector %>% sort # sizes of clusters
[1] 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4 4 4 4
[24] 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 6 6 6 6
[47] 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 8
[70] 8 8 8 8 8 8 8 8 9 9 9 9 9 9 9 9 9 10 10 10 10 10 10
[93] 11 11 11 12 12 12 13 15
文字雲
# install.packages('randomcoloR')
library(randomcoloR)
library(wordcloud)
wc = col_sums(dtm[,1:n])
colors = distinctColorPalette(K)
png("./Amazon2.png", width=3200, height=1800)
textplot(
Y[,1], Y[,2], colnames(dtm)[1:n], show=F,
col=colors[g],
cex= 0.3 + 1.25 * sqrt(wc/mean(wc)),
font=2)
dev.off()
LS0tDQp0aXRsZTog5oOF57eS5YiG5p6Q44CBVEYtaWRm44CB5bC65bqm57iu5rib5a2X6Kme5YWx54++DQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQoNCmBgYHtyfQ0KbGlicmFyeShkYXRhLnRhYmxlKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoc3RyaW5ncikNCg0KQW1hem9uPWZyZWFkKCcuL2RhdGEvQW1hem9uX3Jldmlld3MuY3N2JykNCg0KIyBwcmUtcHJvY2Vzc2luZzoNCnJldmlld3M9QW1hem9uJGFydENvbnRlbnQNCg0KI3Jldmlld3M9cmV2aWV3c1stMl0NCmNsZWFuPWZ1bmN0aW9uKHJldmlldyl7DQogIHJldmlld3MgPC0gYXMubGlzdCh0KHJldmlldykpDQogIHJldmlld3MgPC1zdHJfdHJpbShyZXZpZXdzKSANCiAgcmV2aWV3cyA8LSB0b3VwcGVyKHJldmlld3MpIA0KICByZXZpZXdzIDwtIGdzdWIoIiciLCAiIiwgcmV2aWV3cykgICMgcmVtb3ZlIGFwb3N0cm9waGVzDQogIHJldmlld3MgPC0gZ3N1YigiW1s6cHVuY3Q6XV0iLCAiICIsIHJldmlld3MpICAjIHJlcGxhY2UgcHVuY3R1YXRpb24gd2l0aCBzcGFjZQ0KICByZXZpZXdzIDwtIGdzdWIoIltbOmNudHJsOl1dIiwgIiAiLCByZXZpZXdzKSAgIyByZXBsYWNlIGNvbnRyb2wgY2hhcmFjdGVycyB3aXRoIHNwYWNlDQogIHJldmlld3MgPC0gZ3N1YigiW1s6ZGlnaXQ6XV0iLCIgIiwgcmV2aWV3cykgIyByZW1vdmUgbnVtYmVyDQogIHJldmlld3MgPC0gZ3N1YigiXltbOnNwYWNlOl1dKyIsICIiLCByZXZpZXdzKSAjIHJlbW92ZSB3aGl0ZXNwYWNlIGF0IGJlZ2lubmluZyBvZiBkb2N1bWVudHMNCiAgcmV2aWV3cyA8LSBnc3ViKCJbWzpzcGFjZTpdXSskIiwgIiIsIHJldmlld3MpICMgcmVtb3ZlIHdoaXRlc3BhY2UgYXQgZW5kIG9mIGRvY3VtZW50cw0KICByZXZpZXdzIDwtIHRvbG93ZXIocmV2aWV3cykgICMgZm9yY2UgdG8gbG93ZXJjYXNlDQogIHJldmlld3MNCn0NCg0KDQoNCmBgYA0KDQojIyNTZW50aW1lbnQgYW5hbHlzaXMNCg0KYGBge3J9DQoNCiNpbnN0YWxsLnBhY2thZ2VzKCdzeXV6aGV0JykNCmxpYnJhcnkoc3l1emhldCkNClg9ZGF0YS5mcmFtZSgpDQoNCnNwbGl0KEFtYXpvbixBbWF6b24kdG9vbF9uYW1lKSAlPiUgc2FwcGx5KC4sZnVuY3Rpb24odil7DQogICAgICB0eHQ9Y2xlYW4odiRhcnRDb250ZW50KQ0KICAgICAgc2VudGkgPSBnZXRfbnJjX3NlbnRpbWVudCh0eHQpDQogICAgICBmID0gZGF0YS5mcmFtZSh0KGNvbFN1bXMoc2VudGkpKSkNCiAgICAgIFggPDwtIHJiaW5kKFgsZikNCiAgICAgIHJvd25hbWVzKFgpW25yb3coWCldIDw8LSB2JHRvb2xfbmFtZQ0KICAgICANCiAgfSkNCg0KYGBgDQoNCiMjIyNWaXN1YWxpemF0aW9uDQoNCmBgYHtyfQ0KDQojZGV2dG9vbHM6Omluc3RhbGxfZ2l0aHViKCJqYmt1bnN0L2hpZ2hjaGFydGVyIikNCmxpYnJhcnkoaGlnaGNoYXJ0ZXIpDQpsaWJyYXJ5KGRwbHlyKQ0KDQpwbWFwID0gZnVuY3Rpb24oZHgsc3QyPSIiKSB7ICMgcGVyY2VwdHVhbCBtYXANCiAgaGNoYXJ0KHByaW5jb21wKGR4KSxjb3I9VCkgJT4lDQogICAgaGNfY2hhcnQoem9vbVR5cGUgPSAieHkiKSAlPiUNCiAgICBoY19hZGRfdGhlbWUoaGNfdGhlbWVfZmxhdCgpKSAgJT4lDQogICAgaGNfdG9vbHRpcChoZWFkZXJGb3JtYXQ9IiIscG9pbnRGb3JtYXQ9IntzZXJpZXMubmFtZX0iKSAlPiUNCiAgICBoY190aXRsZSh0ZXh0PSJFbW90aW9ucyBvbiBCcmFuZHMgb2YgSGFtbWVycyIpICU+JQ0KICAgIGhjX3N1YnRpdGxlKHRleHQ9cGFzdGUwKCI5MyBicmFuZHMgIixzdDIpKX0NCg0KcG1hcChYWywxOjhdLyhyb3dTdW1zKFhbLDE6OF0pKzEpKQ0KI3NhdmUoWCxmaWxlPScwOTI5LlJkYXRhJykNCg0KYGBgDQoNCg0KDQojIyPlsLrluqbnuK7muJvlrZfoqZ7lhbHnj74NCg0KDQojIyMj5bu656uL5a2X6aC76KGoDQpgYGB7cn0NCg0KbGlicmFyeSh0bSkNCmxpYnJhcnkoc3RyaW5ncikNCmR0bSA9IEFtYXpvbiRhcnRDb250ZW50ICU+JSANCiAgaWNvbnYodG8gPSAidXRmLTgiLCBzdWI9IiIpICU+JSANCiAgc3RyX3RyaW0oLikgJT4lDQogIHRvdXBwZXIoKSAlPiUNCiAgVmVjdG9yU291cmNlICU+JSBDb3JwdXMgJT4lIA0KICB0bV9tYXAoY29udGVudF90cmFuc2Zvcm1lcih0b2xvd2VyKSkgJT4lIA0KICB0bV9tYXAocmVtb3ZlUHVuY3R1YXRpb24pICU+JSANCiAgdG1fbWFwKHN0ZW1Eb2N1bWVudCkgJT4lIA0KICBEb2N1bWVudFRlcm1NYXRyaXggJT4lIA0KICByZW1vdmVTcGFyc2VUZXJtcygwLjk5OCkNCmR0bSAgIyAoZG9jdW1lbnRzOiAxNDE1NiwgdGVybXM6IDEwMzApDQpkdG1fdG1wPWR0bQ0KYGBgDQoNCg0KIyMjI1RGLUlERiDpgY7mv74NCmBgYHtyfQ0KDQpsaWJyYXJ5KHNsYW0pDQp0ZmlkZiA9IHRhcHBseShkdG0kdi9yb3dfc3VtcyhkdG0pW2R0bSRpXSwgZHRtJGosIG1lYW4pICoNCiAgbG9nMihucm93KGR0bSkvY29sX3N1bXMoZHRtID4gMCkpDQpzdW1tYXJ5KHRmaWRmKQ0KDQpkdG09ZHRtX3RtcA0KZHRtID0gZHRtWywgdGZpZGYgPiAwLjI0IF0NCmR0bSA9IGR0bVssb3JkZXIoLWNvbF9zdW1zKGR0bSkpXQ0KZGltKGR0bSkNCmBgYA0KDQoNCiMjIyPkvb/nlKh0U05F5YGa5bC65bqm57iu5ribDQpgYGB7cn0NCg0KI2luc3RhbGwucGFja2FnZXMoJ1J0c25lJykNCmxpYnJhcnkoUnRzbmUpDQpuID0gNjM2DQp0c25lID0gZHRtWywgMTpuXSAlPiUgYXMuZGF0YS5mcmFtZS5tYXRyaXggJT4lIA0KICBzY2FsZSAlPiUgdCAlPiUgUnRzbmUoDQogICAgY2hlY2tfZHVwbGljYXRlcyA9IEZBTFNFLCB0aGV0YT0wLjAsIG1heF9pdGVyPTMyMDApDQoNCg0KYGBgDQoNCg0KIyMjI+WxpOe0muW8j+WIhue+pA0KYGBge3J9DQoNClkgPSB0c25lJFkgICAgICAgICAgICAgICMgdFNORSBjb29yZGluYXRlcw0KZCA9IGRpc3QoWSkgICAgICAgICAgICAgIyBkaXN0YW5jZSBtYXRyaXgNCmhjID0gaGNsdXN0KGQpICAgICAgICAgICMgaGktY2x1c3RlcmluZw0KSyA9IDEwMCAgICAgICAgICAgICAgICAjIG51bWJlciBvZiBjbHVzdGVycyANCmcgPSBjdXRyZWUoaGMsSykgICAgICAgICMgY3V0IGludG8gSyBjbHVzdGVycw0KdGFibGUoZykgJT4lIGFzLnZlY3RvciAlPiUgc29ydCAgICAgICAgICMgc2l6ZXMgb2YgY2x1c3RlcnMNCg0KDQpgYGANCg0KDQojIyMj5paH5a2X6ZuyDQpgYGB7cn0NCg0KIyBpbnN0YWxsLnBhY2thZ2VzKCdyYW5kb21jb2xvUicpDQpsaWJyYXJ5KHJhbmRvbWNvbG9SKQ0KbGlicmFyeSh3b3JkY2xvdWQpDQoNCndjID0gY29sX3N1bXMoZHRtWywxOm5dKQ0KY29sb3JzID0gZGlzdGluY3RDb2xvclBhbGV0dGUoSykNCg0KDQpwbmcoIi4vQW1hem9uMi5wbmciLCB3aWR0aD0zMjAwLCBoZWlnaHQ9MTgwMCkNCnRleHRwbG90KA0KICBZWywxXSwgWVssMl0sIGNvbG5hbWVzKGR0bSlbMTpuXSwgc2hvdz1GLCANCiAgY29sPWNvbG9yc1tnXSwNCiAgY2V4PSAwLjMgKyAxLjI1ICogc3FydCh3Yy9tZWFuKHdjKSksDQogIGZvbnQ9MikNCmRldi5vZmYoKQ0KYGBgDQoNCiFb5paH5a2X6ZuyXSguL0FtYXpvbjIucG5nKQ0KDQoNCjxicj48YnI+PGJyPjxicj48YnI+DQoNCjxzdHlsZT4NCmVtIHsNCiAgICBjb2xvcjogI0ZGRUE2QzsNCiAgICBiYWNrZ3JvdW5kOiAjN0Q3RDdEOw0KfQ0KLmNhcHRpb24gew0KICBjb2xvcjogIzc3NzsNCiAgbWFyZ2luLXRvcDogMTBweDsNCn0NCnAgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcHJlIHsNCiAgd29yZC1icmVhazogbm9ybWFsOw0KICB3b3JkLXdyYXA6IG5vcm1hbDsNCiAgbGluZS1oZWlnaHQ6IDE7DQp9DQpwcmUgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcCxsaSB7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoucnsNCiAgbGluZS1oZWlnaHQ6IDEuMjsNCn0NCg0KdGl0bGV7DQogIGNvbG9yOiAjY2MwMDAwOw0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KYm9keXsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCmgxLGgyLGgzLGg0LGg1ew0KICBjb2xvcjogIzAwNjZmZjsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCmgzew0KICBjb2xvcjogI2IzNmIwMDsNCiAgYmFja2dyb3VuZDogI2ZmZTBiMzsNCiAgbGluZS1oZWlnaHQ6IDI7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQpoNXsNCiAgY29sb3I6ICMwMDYwMDA7DQogIGJhY2tncm91bmQ6ICNmZmZmZTA7DQogIGxpbmUtaGVpZ2h0OiAyOw0KICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KDQo8L3N0eWxlPg0KDQoNCg==