rm(list=ls(all=T))
options(digits=4, scipen=40)
library(dplyr)
Load Yelp10 Data
load("data/Biz.rdata")
load("data/Rev.rdata")
LOAD = TRUE
if(LOAD) { load("data/tsne.rdata") }

(1) rev - 評論資料框

評論資料框的表頭

head(rev)
  rid    bid       date stars cool funny useful anger anticipation disgust fear joy sadness
1   1 154938 2014-02-17     4    0     0      1     1            1       0    0   3       0
2   2  36983 2016-07-24     4    0     0      0     0            1       1    0   1       0
3   3 103253 2012-04-07     5    0     0      2     0            1       0    0   1       0
4   4  25101 2015-09-11     5    0     0      0     0            1       1    0   0       0
5   5 182709 2016-01-22     5    1     1      1     0            1       0    0   1       0
6   6 135863 2014-09-17     5    0     0      1     0            2       1    0   6       0
  surprise trust negative positive            business_id                user_id nchar year
1        0     3        1        3 Ue6-WhXvI-_1xUIuapl0zQ gVmUR8rqUFdbSeZbsg6z_w   322 2014
2        1     1        1        3 Ae4ABFarGMaI5lk1i98A0w Y6qylbHq8QJmaCRSlKdIog   137 2017
3        1     1        0        1 lKq4Qsz13FDcAVgp49uukQ SnXZkRN9Yf060pNTk1HMDg   108 2012
4        0     0        1        1 6nKR80xEGHYf2UxAe_Cu_g VcmSgvslHAhqWoEn16wjjw   113 2016
5        0     1        0        2 Z_mJYg3vi8cPZHa1J4BALw NKF9v-r0jd1p0JVi9h2T1w   200 2016
6        1     5        0        7 R1PQEK6qvrZVC9qcWfKvDA c2MQ_LPuvtiiKFR_-OY9pg   326 2015
1.1 Quick Check

評論人數、商店數、評論長度

n_distinct(rev$user_id)      # no. user = 1518169
[1] 1518169
n_distinct(rev$bid)          # no. biz =   188593
[1] 188593
sapply(c(5, 10, 50, 100), function(i) sum(rev$nchar <= i))
[1]     69    163   8630 177993
breaks = as.Date(c("2004-07-02", paste0(2005:2018, "-07-02")))
rev$year = as.integer(cut(rev$date, breaks)) + 2004
par(cex=0.8, mfrow=c(1,3), mar=c(7,5,4,2))
table(rev$year) %>% 
  barplot(las=2, main="#Reviews by Year(cut at Jul02)", 
          xlab="", ylab="")
table(rev$stars) %>% barplot(main="No. Stars")
hist(rev$nchar, main="No.Characters")

# average scores
df = aggregate(cbind(stars,cool,funny,useful) ~  year, data = rev, FUN = mean)
par(cex=0.8, mfrow=c(1,4), mar=c(3,4,4,1))
mapply(barplot, df[2:5], main=names(df)[2:5], las=2)

      stars cool funny useful
 [1,]   0.7  0.7   0.7    0.7
 [2,]   1.9  1.9   1.9    1.9
 [3,]   3.1  3.1   3.1    3.1
 [4,]   4.3  4.3   4.3    4.3
 [5,]   5.5  5.5   5.5    5.5
 [6,]   6.7  6.7   6.7    6.7
 [7,]   7.9  7.9   7.9    7.9
 [8,]   9.1  9.1   9.1    9.1
 [9,]  10.3 10.3  10.3   10.3
[10,]  11.5 11.5  11.5   11.5
[11,]  12.7 12.7  12.7   12.7
[12,]  13.9 13.9  13.9   13.9
[13,]  15.1 15.1  15.1   15.1
[14,]  16.3 16.3  16.3   16.3


(2) user - 評論人資料框

user = rev %>% group_by(user_id) %>% summarise(
  n = n(),
  star = mean(stars),
  funny = mean(funny),
  useful = mean(useful),
  cool = mean(useful)
  )
save(user, file="data/User.rdata", compress=T)
par(cex=0.8, mfrow=c(1,2), mar=c(5,5,4,2))
hist(log(user$n), main="No. Reviews per User (log)")
hist(user$star, main="Avg. Stars per User (log)")

par(cex=0.8, mfrow=c(1,3), mar=c(5,5,4,2))
hist(pmin(user$funny,10), main="Avg. Funny's per User")
hist(pmin(user$cool,10), main="Avg. Cool's per User")
hist(pmin(user$useful,10), main="Avg. Useful's per User")


(3) X - 商店類別矩陣 Biz-Category Matrix

3.1 X - BC matrix, 1306 categories

每一個商店可能屬於很多個商業類別,所以商店和類別之間的關係需要用矩證的方式表示。

dim(X)
Loading required package: Matrix
NULL

大多數的商店都屬於多於一個類別

par(cex=0.8, mar=c(3,4,4,2))
rowSums(X) %>% table %>% head(10) %>% barplot(main="No. Categoy per Biz")

各類別的商店數大致上是長尾分佈(power distribution)

par0 = par(cex=0.7, mar=c(11,4.5,3,0))
colSums(X)[1:40] %>% barplot(las=2, main="Top 40 Biz Category")

3.2 X - dense BC matrix, 822 categories

有一些商業類別的商店很少,我們決定只留下商店數大於20的商業類別

X = X[,colSums(X) > 20]
dim(X)                   # 188593    936
[1] 188593    936
identical(B$business_id, rownames(X))  # TRUE
[1] TRUE
3.3 C - 商業類別摘要
C = apply(X, 2, function(v) c(sum(v), sum(B[v,]$review_count)))
C = C %>% t %>% data.frame %>% setNames(c("n_biz", "n_rev")) %>% 
  mutate(a_rev = n_rev/n_biz)
C$name = colnames(X)
sapply(list(X=X, B=B, C=C), dim)
          X      B   C
[1,] 188593 188593 936
[2,]    936     80   4


(4) 商業類別字雲 Category Word Cloud by Businesses

以下我們使用字雲觀察商業類別之間的相似性, 使用tSNE,將X的尺度 [188593 x 936] 縮減為 [2 x 936] …

library(RColorBrewer)
library(wordcloud)
library(Rtsne)

if(!LOAD) {
  t0 = Sys.time()
  set.seed(123)
  tsneCat = Rtsne(as.matrix(t(X)), check_duplicates=F, theta=0.0, max_iter=3000)
  Sys.time() - t0   # 3.857 mins
  } 

在縮減尺度之中做階層式集群分析。 Clustering to set the color of words

Y = tsneCat$Y           # tSNE coordinates
d = dist(Y)             # distance matrix
hc = hclust(d)          # hi-clustering
K = 80                  # number of clusters 
C$group = g = cutree(hc,K)        # cut into K clusters
table(g) %>% as.vector %>% sort   # sizes of clusters
 [1]  2  2  4  4  4  4  4  4  5  6  6  6  7  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8  9  9  9  9
[33]  9 10 10 10 10 10 10 10 11 11 11 11 11 11 11 12 12 13 13 13 14 14 14 15 15 15 15 15 15 16 16 16
[65] 16 17 17 17 17 18 18 18 20 20 21 23 23 24 28 31

Adjusting the range of C$n_rev (no. review) to set the fint size of words

sz = 0.7 + sqrt(C$n_rev)/500
range(sz)  
[1] 0.7211 4.5235

Plot word cloud to a .PNG file

png("fig/category.png", width=3200, height=1800)
textplot(Y[,1], Y[,2], C$name, font=2, 
         col = randomcoloR::distinctColorPalette(K)[g],
         cex = sz ) # size by no. reviews
dev.off()
png 
  2 

將字雲畫在category.png裡面:


我們分別使用了尺度縮減和集群分析來做以上的字雲,其中 …
  ■ 尺度縮減的
    ● 原始尺度有多少個?
    ● 縮減之後剩下多少尺度?
    ● 原始尺度是什麼?換句話說,我們是根據甚麼來做尺度縮減?

  ■ 我們是根據什麼做的集群分析?
    ● 是原始尺度、還是縮減之後的尺度?
    ● 用原始和縮減尺度、會有什麼差別?



(5) 評論話題字雲 Theme Word Cloud

使用字雲觀察評論話題(Theme)之間的相似性

5.1 Average Sentiment & Empath scores per business

接下來考慮評論的話題,我們已經預先使用Stanford的Empath Text Classifier,依其預設的194種內容(Class), 對這5,996,996篇評論分別做過評分,文集之中的每一篇評論都有194個內容評分,放在 data/Tmx.rdata 裡面;由於資料太大,我們先依商店和評論人分別對話題全做過平均。

load("data/Tmx.rdata")

平均情緒評分、平均情緒評分

if(LOAD) { load("data/BCscores") } else 
{
  t0 = Sys.time()
  cat_senti = apply(X, 2, function(v) colMeans( rev[rev$bid %in% B$bid[v], 8:17] ) ) %>% t
  Sys.time() - t0
  biz_senti = aggregate(. ~ bid, data=rev[, c(2,8:17)], mean)
  Sys.time() - t0
  cat_theme = apply(X, 2, function(v) colMeans( Tmx[rev$bid %in% B$bid[v],] ) ) %>% t 
  Sys.time() - t0
  biz_theme = aggregate(as.data.frame.matrix(Tmx), list(bid = rev$bid), mean)
  Sys.time() - t0
  gc()
  save(biz_senti, biz_theme, cat_senti, cat_theme, file="data/BCscores", compress=T)
}

biz_senti - average sentiment scores per biz

dim(biz_senti)
[1] 188593     11

biz_theme - average theme weights per biz

dim(biz_theme)
[1] 188593    195
5.2 話題的討論強度 - Summerize Empath Scores across Biz
themes = data.frame(name=colnames(Tmx), weight=colSums(Tmx), stringsAsFactors=F)
par(mar=c(8,4,4,2), cex=0.7)
colSums(Tmx)[1:20] %>% barplot(main="Sums of Empath Scores, Top20 Themes", las=2)

par(mar=c(2,4,4,2), cex=0.7)
themes$weight %>% barplot(main="Sums of Empath Scores")

5.4 話題字雲 Theme Word Cloud
if(!LOAD) {
  t0 = Sys.time()
  set.seed(123)
  tsneTheme = biz_theme[,-1] %>% scale %>% as.matrix %>% t %>% 
    Rtsne(check_duplicates=F, theta=0.0, max_iter=3000)
  Sys.time() - t0  # 29.21 secs
}
Y = tsneTheme$Y         # tSNE coordinates
d = dist(Y)             # distance matrix
hc = hclust(d)          # hi-clustering
K = 40                  # number of clusters 
themes$group = g = cutree(hc,K)  # clustering for color
table(g) %>% as.vector %>% sort  # size of clusters
 [1] 2 2 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 7 7 7 7 7 8
sz = sqrt(themes$weight)/100 + 1.5
range(sz)   
[1] 1.591 5.122
png("fig/theme.png", width=3200, height=1800)
textplot(Y[,1], Y[,2], themes$name, font=2, 
         col = randomcoloR::distinctColorPalette(K)[g],    # color by group    
         cex = sz )                                        # size by total weight
dev.off()
png 
  2 


(6) 話題、類別的對應關係

6.1 TC - Theme-Category Matrix

先將評論話題與商業類別之間的關係整理成矩陣

library(d3heatmap)
#TC = apply(X, 2, function(i) 100*colMeans(E[i > 0,]) )
#dim(TC) 
#sapply(list(TC, colSums(TC), rowSums(TC)), range)

然後用熱圖表現出評論話題與商業類別之間的關係

library(d3heatmap)
cat_theme[1:100, 4:80] %>% t %>% d3heatmap(colors = cm.colors(13)[3:13]) 
# rev(brewer.pal(11,"Spectral"))
# scale on the theme scores
cat_theme[1:350, 1:194] %>% scale %>% d3heatmap(
  show_grid=F, 
  xaxis_font_size = "0px", xaxis_height = 10,
  yaxis_font_size = "0px", yaxis_width = 10,
  colors = brewer.pal(9,"Greens")
  ) 
# library(morpheus)
# cat_theme[1:200, 4:100] %>% t %>% morpheus
6.2 話題與類別群組 Theme-Category Group Mapping

用同樣的方法,我們也可以用熱圖來表現話題群組和商業類別群組之間的關係

#x = sapply(1:max(C$group), function(i) rowSums(X[,C$group == i]) > 0)
#x = apply(x, 2, function(i) 100 * colMeans(E[i,]))
#x = sapply(1:max(theme$group), function(i) 
#  colMeans(x[theme$group==i,] ) )
#sapply(list(TC, colSums(TC), rowSums(TC)), range)
#x %>% scale %>% t %>% d3heatmap(
#  scale="none", 
#  colors = rev(brewer.pal(11,"Spectral"))) 


(7) 情緒與商業類別

7.1 biz_senti - 商店的平均情緒分數
S = biz_senti[,-1]
summary(S)
     anger        anticipation      disgust           fear             joy           sadness      
 Min.   :0.000   Min.   : 0.00   Min.   :0.000   Min.   : 0.000   Min.   : 0.00   Min.   : 0.000  
 1st Qu.:0.500   1st Qu.: 1.80   1st Qu.:0.333   1st Qu.: 0.500   1st Qu.: 1.73   1st Qu.: 0.582  
 Median :0.762   Median : 2.33   Median :0.623   Median : 0.821   Median : 2.43   Median : 0.898  
 Mean   :0.882   Mean   : 2.48   Mean   :0.703   Mean   : 0.965   Mean   : 2.54   Mean   : 1.024  
 3rd Qu.:1.143   3rd Qu.: 3.00   3rd Qu.:0.958   3rd Qu.: 1.250   3rd Qu.: 3.20   3rd Qu.: 1.333  
 Max.   :9.000   Max.   :13.33   Max.   :8.000   Max.   :13.667   Max.   :16.00   Max.   :10.667  
    surprise         trust          negative        positive    
 Min.   :0.000   Min.   : 0.00   Min.   : 0.00   Min.   : 0.00  
 1st Qu.:0.692   1st Qu.: 2.33   1st Qu.: 1.27   1st Qu.: 4.00  
 Median :1.000   Median : 3.00   Median : 1.88   Median : 5.00  
 Mean   :1.092   Mean   : 3.13   Mean   : 2.10   Mean   : 5.28  
 3rd Qu.:1.368   3rd Qu.: 3.73   3rd Qu.: 2.67   3rd Qu.: 6.29  
 Max.   :8.000   Max.   :19.33   Max.   :18.67   Max.   :31.00  
par(cex=0.8, mar=c(6,4,4,2))
boxplot(S, las=2, main="Avg. Sentiment per Biz")

library(corrplot)
corrplot 0.84 loaded
par(cex=0.7)
corrplot.mixed(cor(S))

7.2 CS (商業類別x情緒) $ CT (商業類別x話題) 矩陣

將 評論情緒評分(10項) 和 評論主題評分(194項) 依 商業類別(822類) 平均起來,分別放在:

  • CS [936 x 10]: 10 Average Sentiment Scores per business category
  • CT [936 x 194]: 194 Average Class Weights per business category

這兩個矩陣裡面:

CS = cat_senti
CT = cat_theme
dim(CS); dim(CT)
[1] 936  10
[1] 936 194
7.3 情緒的主成份分析

先對情緒矩陣(sx)做主成份分析

library(FactoMineR)
library(factoextra)
Loading required package: ggplot2
Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(highcharter)
Highcharts (www.highcharts.com) is a Highsoft software product which is
not free for commercial and Governmental use
ncp=10  # number of components to keep
pcx = PCA(CS, ncp=ncp, graph=F) 
par(cex=0.8)
barplot(pcx$eig[1:ncp,3],names=1:ncp,main="Accumulated Variance",
        xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')

跟據上圖,前兩個主成份就涵蓋了80%的變異量。 但是當我們想要將商業類別標示在前兩個主成份的平面上的時候 …

par(cex=0.7)
fviz_pca_biplot(pcx)

7.4 繪圖輔助工具

近兩年來R的畫圖套件幾乎都具備了輸出互動網頁的能力,以下我們先寫一個helper function,來幫助我們檢視主成份分析的結果。

source("bipcx.R")
7.5 前三個主成分

使用上面bipcx()這個function,我們可以清楚的看到商業類別(由於後面的商業類別評論數不多,我們只畫前400個類別)在第一、二主成份 …

bipcx(pcx,1,2,10,400,t1="Strength",t2="Valence",obs='Biz Category',
      main="PCA on Sentiment Scores",ratio=0.5)

第二、三主成份

bipcx(pcx,3,2,10,300,t1="Arousal",t2="Valence",obs='Biz Category',
      main="PCA on Sentiment Scores")

從以上的圖形我們可以辨識出來,第一、二、三主成份正好分別代表情緒的:

  • 強度 (Strength)
  • 正負值 (Valence)
  • 激發程度 (Arousal)


(8) 討論話題與商業類別

話題矩陣的尺度(194)比情緒矩陣(10)大很多,即使我們只挑前400個商業類別和前50個內容項目 …

ncp=30
# only take large categories and large classes
pcx = PCA(CT[1:600, 1:50],ncp=ncp,graph=F) 
par(cex=0.8)
barplot(pcx$eig[1:ncp,3],names=1:ncp,main="Accumulated Variance",
        xlab="No. Components", ylab="% of Variance", las=2)
abline(h=seq(0,100,10),col='lightgray')  # 12 PC's cover ~75% of variance

做完主成份分析之後,前4個主成份也只涵蓋60%的變異量。 在這種資料點和尺度都很多的狀況之下,互動式的圖表更能幫助我們觀察到 原始尺度和資料點之間的關係。

以下我們將前幾個主成份,以兩兩成對的方式, 分別畫出在該平面上變異最大的20個話題和200個商業類別。 在這些平面上,我們通常可以看到一些不容易從簡單的敘事統計看出來的關係。

bipcx(pcx,1,2,20,200,obs='Biz Category',
      main="PCA on LIWC Classes, Dim. 1 & 2",ratio=0.5)
bipcx(pcx,3,4,20,200,obs='Biz Category',
      main="PCA on LIWC Classes, Dim. 3 & 4")
bipcx(pcx,1,3,20,200,obs='Biz Category',
      main="PCA on LIWC Classes, Dim. 1 & 3")
bipcx(pcx,2,4,20,200,obs='Biz Category',
      main="PCA on LIWC Classes, Dim. 2 & 4")
Save the Results
categories = C
save(biz_senti, biz_theme, cat_senti, cat_theme, categories, themes, 
     file="data/BCscores.rdata", compress=T)
save(tsneCat, tsneTheme, file="data/tsne.rdata", compress=T)