rm(list=ls(all=T))
options(digits=4, scipen=40)
library(dplyr)ypath = "/home/tonychuo/_S2018/yelp10/data/"
load(paste0(ypath, "yelp10.rdata"))
LOAD = FALSE
if(LOAD) load("data/tsne.rdata")rev - 評論資料框評論資料框的表頭
head(rev)# A tibble: 6 x 9
cool date funny stars useful rid nchar bid uid
<int> <date> <int> <int> <int> <int> <int> <int> <int>
1 0 2015-10-24 0 5 0 1 111 114595 841658
2 1 2013-02-01 0 4 2 2 408 114595 5992
3 0 2011-12-29 0 4 0 3 669 114595 93712
4 0 2015-04-16 0 3 0 4 21 114595 844656
5 0 2017-02-26 0 5 0 5 90 114595 249488
6 0 2014-12-24 0 5 0 6 108 114595 842819
評論人數、商店數、評論長度
n_distinct(rev$uid) # no. user = 1183361[1] 1183361
n_distinct(rev$bid) # no. biz = 156637[1] 156637
range(rev$nchar) # length (no. char.) of review [1] 1 5000
par(cex=0.8, mfrow=c(1,3), mar=c(7,5,4,2))
hist(rev$date,"years", las=2, main="No. Reviews", freq=T, xlab="", ylab="")
table(rev$stars) %>% barplot(main="No. Stars")
hist(rev$nchar, main="No.Characters")# average scores
rev$year = format(rev$date, "%Y") %>% as.integer
df = aggregate(cbind(stars,cool,funny,useful) ~ year, data = rev, FUN = mean)par(cex=0.8, mfrow=c(1,4), mar=c(3,4,4,1))
mapply(barplot, df[2:5], main=names(df)[2:5], las=2) stars cool funny useful
[1,] 0.7 0.7 0.7 0.7
[2,] 1.9 1.9 1.9 1.9
[3,] 3.1 3.1 3.1 3.1
[4,] 4.3 4.3 4.3 4.3
[5,] 5.5 5.5 5.5 5.5
[6,] 6.7 6.7 6.7 6.7
[7,] 7.9 7.9 7.9 7.9
[8,] 9.1 9.1 9.1 9.1
[9,] 10.3 10.3 10.3 10.3
[10,] 11.5 11.5 11.5 11.5
[11,] 12.7 12.7 12.7 12.7
[12,] 13.9 13.9 13.9 13.9
[13,] 15.1 15.1 15.1 15.1
[14,] 16.3 16.3 16.3 16.3
user - 評論人資料框user = rev %>% group_by(uid) %>% summarise(
n = n(),
star = mean(stars),
funny = mean(funny),
useful = mean(useful),
cool = mean(useful)
)
range(user$uid) # 1183362 ?[1] 1 1183362
# there ia a missing user id = 1038356
setdiff(1:1183362, user$uid) [1] 1038356
par(cex=0.8, mfrow=c(1,2), mar=c(5,5,4,2))
hist(log(user$n), main="No. Reviews per User (log)")
hist(user$star, main="Avg. Stars per User (log)")par(cex=0.8, mfrow=c(1,3), mar=c(5,5,4,2))
hist(pmin(user$funny,10), main="Avg. Funny's per User")
hist(pmin(user$cool,10), main="Avg. Cool's per User")
hist(pmin(user$useful,10), main="Avg. Useful's per User")X - 商店類別矩陣 Biz-Category Matrix每一個商店可能屬於很多個商業類別,所以商店和類別之間的關係需要用矩證的方式表示。
mxBC - full BC matrix, 1240 categorieslibrary(Matrix)
dim(mxBC) # 156261, 1240[1] 156261 1240
table(mxBC@x) # it should not be larger than 1 ?
1 2
590280 5
原始的商店類別矩陣mxBC需要先做一些清理工作,清理過之後,我們將各商業類別依其商店數的多寡降冪排列
# clean up
mxBC@x[mxBC@x == 2] = 1
mxBC = mxBC[, order(-colSums(mxBC))]大多數的商店都屬於多於一個類別
par(cex=0.8, mar=c(3,4,4,2))
rowSums(mxBC) %>% table %>% head(10) %>% barplot(main="No. Categoy per Biz")各類別的商店數大致上是長尾分佈(power distribution)
par0 = par(cex=0.8, mar=c(6,12,3,2))
colSums(mxBC)[1:40] %>% rev %>%
barplot(horiz=T, las=2, main="Top 40 Category", xlab="No. Biz")X - dense BC matrix, 822 categories有一些商業類別的商店很少,我們決定只有下商店數大於20的商業類別
X = mxBC[,colSums(mxBC) > 20]
X = X[rowSums(X) > 0,]
dim(X)[1] 156261 822
# there is a missing biz
setdiff(as.integer(rownames(X)), biz$bid)[1] 14178
X = X[rownames(X) != "14178", ] # remove the missing row
setdiff(as.integer(rownames(X)), biz$bid)integer(0)
dim(X)[1] 156260 822
B - 商店基本資料B = data.frame(bid = as.integer(rownames(X))) %>% left_join(biz)Joining, by = "bid"
identical(B$bid, as.integer(rownames(X))) # TRUE[1] TRUE
head(B[, -c(2:3)]) bid city is_open latitude longitude name neighborhood postal_code
1 1 Phoenix FALSE 33.45 -112.07 Chili's Too <NA> 85073
2 2 Phoenix TRUE 33.47 -112.20 Reparo Landscaping <NA> 85035
3 3 Phoenix TRUE 33.47 -112.26 AZ West Endoscopy Center <NA> 85037
4 4 Pineville TRUE 35.09 -80.89 The Well <NA> 28134
5 5 Tempe TRUE 33.35 -111.96 Goodwill Store 095 <NA> 85284
6 6 Charlotte FALSE 35.20 -80.84 The Italian Pie Dilworth 28203
review_count stars state
1 30 2.5 AZ
2 3 5.0 AZ
3 5 4.5 AZ
4 3 5.0 NC
5 5 3.0 AZ
6 27 3.0 NC
C - 商業類別摘要C = apply(X, 2, function(i) c(sum(i), sum(B[i > 0,]$review_count)))
C = C %>% t %>% data.frame %>% setNames(c("n_biz", "n_rev")) %>%
mutate(a_rev = n_rev/n_biz)
C$name = colnames(X)sapply(list(X=X, B=B, C=C), dim) X B C
[1,] 156260 156260 822
[2,] 822 13 4
以下我們使用字雲觀察商業類別之間的相似性, 使用tSNE,將X的尺度 [156,259 x 822] 縮減為 [2 x 822] …
library(RColorBrewer)
library(wordcloud)
library(Rtsne)
if(!LOAD) {
t0 = Sys.time()
set.seed(123)
tsneCat = Rtsne(as.matrix(t(X)), check_duplicates=F, theta=0.0, max_iter=3000)
Sys.time() - t0 # 2.769 mins
} Time difference of 2.776 mins
在縮減尺度之中做階層式集群分析。 Clustering to set the color of words
Y = tsneCat$Y # tSNE coordinates
d = dist(Y) # distance matrix
hc = hclust(d) # hi-clustering
K = 80 # number of clusters
C$group = g = cutree(hc,K) # cut into K clusters
table(g) %>% as.vector %>% sort # sizes of clusters [1] 3 3 3 3 4 4 4 4 5 5 5 5 6 6 6 6 6 6 6 7 7 7 7 7 7 8 8 8 8 8 8 8
[33] 9 9 9 9 9 9 9 9 10 10 10 10 10 10 10 10 11 11 11 12 12 13 13 13 13 14 14 14 14 14 14 14
[65] 14 14 14 14 15 15 16 16 16 17 18 20 20 21 22 23
Adjusting the range of C$n_rev (no. review) to set the fint size of words
range(-0.45 + log(C$n_rev)/5) [1] 0.5092 2.5279
Plot word cloud to a .PNG file
png("fig/category.png", width=3200, height=1800)
textplot(Y[,1], Y[,2], C$name, font=2,
col = randomcoloR::distinctColorPalette(K)[g],
cex = -0.45 + log(C$n_rev)/5 ) # size by no. reviews
dev.off()png
2
將字雲畫在category.png裡面:
我們分別使用了尺度縮減和集群分析來做以上的字雲,其中 …
■ 尺度縮減的
● 原始尺度有多少個?
● 縮減之後剩下多少尺度?
● 原始尺度是什麼?換句話說,我們是根據甚麼來做尺度縮減?
■ 我們是根據什麼做的集群分析?
● 是原始尺度、還是縮減之後的尺度?
● 用原始和縮減尺度、會有什麼差別?
使用字雲觀察評論話題(Theme)之間的相似性
接下來考慮評論的話題,我們已經預先使用 Empath Text Classifier ,依其預設的194種內容(Class), 對這4,736,865篇評論分別做過評分,文集之中的每一篇評論都有194個內容評分,放在 /home/tonychuo/_S2018/yelp10/data/empath.rdata 裡面;由於資料太大,我們已經先依商店和評論人分別對話題評分做過平均。
load(paste0(ypath, "biz_se.rdata"))biz_senti - average sentiment scores per biz
dim(biz_senti)[1] 156638 11
biz_emapth - average empath scores per biz
dim(biz_empath)[1] 156638 195
It appearss that one business’s is NA
is.na(biz_senti) %>% sum[1] 0
is.na(biz_empath) %>% sum[1] 194
E - 各商店的平均話題權重E = data.frame(bid = B$bid) %>% left_join(biz_empath)Joining, by = "bid"
identical(B$bid, as.integer(rownames(X)))[1] TRUE
identical(E$bid, as.integer(rownames(X)))[1] TRUE
i = which(is.na(E$help)); i[1] 12230
E = E[-i,]; X = X[-i,]; B = B[-i,]
rm(biz_empath); gc() used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2253871 120.4 3886542 207.6 3886542 207.6
Vcells 107815810 822.6 680000431 5188.0 770394542 5877.7
rownames(E) = E$bid
E$bid = NULL
E = E[, order(-colSums(E))]par(mar=c(6,4,4,2), mfrow=c(2,1), cex=0.7)
colSums(E[,1:20]) %>% barplot(main="Sums of Empath Scores, Top20 Themes", las=2)
colSums(E) %>% barplot(main="Sums of Empath Scores", las=2)if(! LOAD) {
t0 = Sys.time()
set.seed(123)
tsneTheme = E %>% scale %>% as.matrix %>% t %>%
Rtsne(check_duplicates=F, theta=0.0, max_iter=3000)
Sys.time() - t0 # 23.69 secs
}Time difference of 24.57 secs
Y = tsneTheme$Y # tSNE coordinates
d = dist(Y) # distance matrix
hc = hclust(d) # hi-clustering
K = 40 # number of clusters
g = cutree(hc,K) # clustering for color
table(g) %>% as.vector %>% sort # size of clusters [1] 2 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 6 6 6 6 7
[33] 7 7 7 8 8 8 9 10
range(log(colSums(E))/ 3 + 0.5) # themes' weights for font size [1] 0.7575 3.2098
png("fig/theme.png", width=3200, height=1800)
textplot(Y[,1], Y[,2], colnames(E), font=2,
col = randomcoloR::distinctColorPalette(K)[g], # color by group
cex = log(colSums(E))/2 + 0.5 ) # size by no. reviews
dev.off()png
2
theme - 話題摘要theme = data.frame(
name = colnames(E),
weight = colSums(E),
group = g
) TC - Theme-Category Matrix先將評論話題與商業類別之間的關係整理成矩陣
library(d3heatmap)
TC = apply(X, 2, function(i) 100*colMeans(E[i > 0,]) )
dim(TC) [1] 194 822
sapply(list(TC, colSums(TC), rowSums(TC)), range) [,1] [,2] [,3]
[1,] 0.000 33.32 1.142
[2,] 3.301 41.40 1789.675
然後用熱圖表現出評論話題與商業類別之間的關係
TC[1:70, 1:100] %>% t %>% scale %>% t %>%
d3heatmap(colors = cm.colors(20)) 用同樣的方法,我們也可以用熱圖來表現話題群組和商業類別群組之間的關係
x = sapply(1:max(C$group), function(i) rowSums(X[,C$group == i]) > 0)
x = apply(x, 2, function(i) 100 * colMeans(E[i,]))
x = sapply(1:max(theme$group), function(i)
colMeans(x[theme$group==i,] ) )
sapply(list(TC, colSums(TC), rowSums(TC)), range) [,1] [,2] [,3]
[1,] 0.000 33.32 1.142
[2,] 3.301 41.40 1789.675
x %>% scale %>% t %>% d3heatmap(
scale="none",
colors = rev(brewer.pal(11,"Spectral"))) S - 商店的平均情緒分數S = data.frame(bid = B$bid) %>% left_join(biz_senti)Joining, by = "bid"
rownames(S) = S$bid
S$bid = NULL
identical(B$bid, as.integer(rownames(S)))[1] TRUE
is.na(S) %>% sum[1] 0
range(S)[1] 0.00 30.33
par(cex=0.8)
boxplot(S, las=2, main="Avg. Sentiment per Biz")library(corrplot)corrplot 0.84 loaded
par(cex=0.7)
corrplot.mixed(cor(S))CS (商業類別x情緒) $ CT (商業類別x話題) 矩陣將 評論情緒評分(10項) 和 評論主題評分(194項) 依 商業類別(822類) 平均起來,分別放在:
CT [822 x 10]: 10 Average Sentiment Scores per business categoryCS [822 x 194]: 194 Average Class Weights per business category這兩個矩陣裡面:
CS = apply(X, 2, function(i) colMeans(S[i > 0,])) %>% t
CT = t(TC)
dim(CS); dim(CT)[1] 822 10
[1] 822 194
先對情緒矩陣(sx)做主成份分析
library(FactoMineR)
library(factoextra)Loading required package: ggplot2
Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(highcharter)Highcharts (www.highcharts.com) is a Highsoft software product which is
not free for commercial and Governmental use
ncp=10 # number of components to keep
pcx = PCA(CS, ncp=ncp, graph=F)
par(cex=0.8)
barplot(pcx$eig[1:ncp,3],names=1:ncp,main="Accumulated Variance",
xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')跟據上圖,前兩個主成份就涵蓋了80%的變異量。 但是當我們想要將商業類別標示在前兩個主成份的平面上的時候 …
par(cex=0.7)
fviz_pca_biplot(pcx)近兩年來R的畫圖套件幾乎都具備了輸出互動網頁的能力,以下我們先寫一個helper function,來幫助我們檢視主成份分析的結果。
source("bipcx.R")使用上面bipcx()這個function,我們可以清楚的看到商業類別(由於後面的商業類別評論數不多,我們只畫前400個類別)在第一、二主成份 …
bipcx(pcx,1,2,10,400,t1="Strength",t2="Valence",obs='Biz Category',
main="PCA on Sentiment Scores",ratio=0.5)第二、三主成份
bipcx(pcx,3,2,10,300,t1="Arousal",t2="Valence",obs='Biz Category',
main="PCA on Sentiment Scores")話題矩陣的尺度(194)比情緒矩陣(10)大很多,即使我們只挑前400個商業類別和前80個內容項目 …
ncp=30
# only take large categories and large classes
pcx = PCA(CT[1:400,1:80],ncp=ncp,graph=F)
par(cex=0.8)
barplot(pcx$eig[1:ncp,3],names=1:ncp,main="Accumulated Variance",
xlab="No. Components", ylab="% of Variance", las=2)
abline(h=seq(0,100,10),col='lightgray') # 12 PC's cover ~75% of variance 做完主成份分析之後,前11個主成份也只涵蓋50%的變異量。 在這種資料點和尺度都很多的狀況之下,互動式的圖表更能幫助我們觀察到 原始尺度和資料點之間的關係。
以下我們將前幾個主成份,以兩兩成對的方式, 分別畫出在該平面上變異最大的12個話題和100個商業類別。 在這些平面上,我們通常可以看到一些不容易從簡單的敘事統計看出來的關係。
bipcx(pcx,1,2,12,100,obs='Biz Category',
main="PCA on LIWC Classes, Dim. 1 & 2",ratio=0.5)bipcx(pcx,3,4,12,100,obs='Biz Category',
main="PCA on LIWC Classes, Dim. 3 & 4")bipcx(pcx,1,3,12,100,obs='Biz Category',
main="PCA on LIWC Classes, Dim. 1 & 3")bipcx(pcx,2,4,12,100,obs='Biz Category',
main="PCA on LIWC Classes, Dim. 2 & 4")# category = C
# save(B, X, E, S, bizatt, category, theme, file="data/businesses.rdata", compress=T)
save(tsneCat, tsneTheme, file="data/tsne.rdata", compress=T)
# save(rev, senti, file="data/reviews.rdata", compress=T)