AS12 YelpExplore - 資料探索與清理

rm(list=ls(all=T))
options(digits=4, scipen=40)
library(dplyr)

Load Yelp10 Data

ypath = "/home/tonychuo/_S2018/yelp10/data/" 
load(paste0(ypath, "yelp10.rdata"))

LOAD = FALSE
if(LOAD) load("data/tsne.rdata")

(1) `rev` - 評論資料框

評論資料框的表頭

head(rev)

# A tibble: 6 x 9
   cool date       funny stars useful   rid nchar    bid    uid
  <int> <date>     <int> <int>  <int> <int> <int>  <int>  <int>
1     0 2015-10-24     0     5      0     1   111 114595 841658
2     1 2013-02-01     0     4      2     2   408 114595   5992
3     0 2011-12-29     0     4      0     3   669 114595  93712
4     0 2015-04-16     0     3      0     4    21 114595 844656
5     0 2017-02-26     0     5      0     5    90 114595 249488
6     0 2014-12-24     0     5      0     6   108 114595 842819

1.1 Quick Check

評論人數、商店數、評論長度

n_distinct(rev$uid)    # no. user = 1183361

[1] 1183361

n_distinct(rev$bid)    # no. biz =   156637

[1] 156637

range(rev$nchar)       # length (no. char.) of review

[1]    1 5000

par(cex=0.8, mfrow=c(1,3), mar=c(7,5,4,2))
hist(rev$date,"years", las=2, main="No. Reviews", freq=T, xlab="", ylab="")
table(rev$stars) %>% barplot(main="No. Stars")
hist(rev$nchar, main="No.Characters")

# average scores
rev$year = format(rev$date, "%Y") %>% as.integer
df = aggregate(cbind(stars,cool,funny,useful) ~  year, data = rev, FUN = mean)

par(cex=0.8, mfrow=c(1,4), mar=c(3,4,4,1))
mapply(barplot, df[2:5], main=names(df)[2:5], las=2)

      stars cool funny useful
 [1,]   0.7  0.7   0.7    0.7
 [2,]   1.9  1.9   1.9    1.9
 [3,]   3.1  3.1   3.1    3.1
 [4,]   4.3  4.3   4.3    4.3
 [5,]   5.5  5.5   5.5    5.5
 [6,]   6.7  6.7   6.7    6.7
 [7,]   7.9  7.9   7.9    7.9
 [8,]   9.1  9.1   9.1    9.1
 [9,]  10.3 10.3  10.3   10.3
[10,]  11.5 11.5  11.5   11.5
[11,]  12.7 12.7  12.7   12.7
[12,]  13.9 13.9  13.9   13.9
[13,]  15.1 15.1  15.1   15.1
[14,]  16.3 16.3  16.3   16.3

(2) `user` - 評論人資料框

user = rev %>% group_by(uid) %>% summarise(
  n = n(),
  star = mean(stars),
  funny = mean(funny),
  useful = mean(useful),
  cool = mean(useful)
  )
range(user$uid)               # 1183362 ?

[1]       1 1183362

# there ia a missing user id = 1038356
setdiff(1:1183362, user$uid)

[1] 1038356

par(cex=0.8, mfrow=c(1,2), mar=c(5,5,4,2))
hist(log(user$n), main="No. Reviews per User (log)")
hist(user$star, main="Avg. Stars per User (log)")

par(cex=0.8, mfrow=c(1,3), mar=c(5,5,4,2))
hist(pmin(user$funny,10), main="Avg. Funny's per User")
hist(pmin(user$cool,10), main="Avg. Cool's per User")
hist(pmin(user$useful,10), main="Avg. Useful's per User")

(3) `X` - 商店類別矩陣 Biz-Category Matrix

每一個商店可能屬於很多個商業類別，所以商店和類別之間的關係需要用矩證的方式表示。

3.1 `mxBC` - full BC matrix, 1240 categories

library(Matrix)
dim(mxBC)        # 156261, 1240

[1] 156261   1240

table(mxBC@x)    # it should not be larger than 1 ?


     1      2 
590280      5

原始的商店類別矩陣mxBC需要先做一些清理工作，清理過之後，我們將各商業類別依其商店數的多寡降冪排列

# clean up
mxBC@x[mxBC@x == 2] = 1
mxBC =  mxBC[, order(-colSums(mxBC))]

大多數的商店都屬於多於一個類別

par(cex=0.8, mar=c(3,4,4,2))
rowSums(mxBC) %>% table %>% head(10) %>% barplot(main="No. Categoy per Biz")

各類別的商店數大致上是長尾分佈(power distribution)

par0 = par(cex=0.8, mar=c(6,12,3,2))
colSums(mxBC)[1:40] %>% rev %>% 
  barplot(horiz=T, las=2, main="Top 40 Category", xlab="No. Biz")

3.2 `X` - dense BC matrix, 822 categories

有一些商業類別的商店很少，我們決定只有下商店數大於20的商業類別

X = mxBC[,colSums(mxBC) > 20]
X = X[rowSums(X) > 0,]
dim(X)

[1] 156261    822

# there is a missing biz
setdiff(as.integer(rownames(X)), biz$bid)

[1] 14178

X  = X[rownames(X) != "14178", ]           # remove the missing row
setdiff(as.integer(rownames(X)), biz$bid)

integer(0)

dim(X)

[1] 156260    822

3.3 `B` - 商店基本資料

B = data.frame(bid = as.integer(rownames(X))) %>% left_join(biz)

Joining, by = "bid"

identical(B$bid, as.integer(rownames(X)))  # TRUE

[1] TRUE

head(B[, -c(2:3)])

  bid      city is_open latitude longitude                     name neighborhood postal_code
1   1   Phoenix   FALSE    33.45   -112.07              Chili's Too         <NA>       85073
2   2   Phoenix    TRUE    33.47   -112.20       Reparo Landscaping         <NA>       85035
3   3   Phoenix    TRUE    33.47   -112.26 AZ West Endoscopy Center         <NA>       85037
4   4 Pineville    TRUE    35.09    -80.89                 The Well         <NA>       28134
5   5     Tempe    TRUE    33.35   -111.96       Goodwill Store 095         <NA>       85284
6   6 Charlotte   FALSE    35.20    -80.84          The Italian Pie     Dilworth       28203
  review_count stars state
1           30   2.5    AZ
2            3   5.0    AZ
3            5   4.5    AZ
4            3   5.0    NC
5            5   3.0    AZ
6           27   3.0    NC

3.4 `C` - 商業類別摘要

C = apply(X, 2, function(i) c(sum(i), sum(B[i > 0,]$review_count)))
C = C %>% t %>% data.frame %>% setNames(c("n_biz", "n_rev")) %>% 
  mutate(a_rev = n_rev/n_biz)
C$name = colnames(X)

sapply(list(X=X, B=B, C=C), dim)

          X      B   C
[1,] 156260 156260 822
[2,]    822     13   4

(4a) 商業類別字雲 Category Word Cloud by Businesses

以下我們使用字雲觀察商業類別之間的相似性，使用tSNE，將X的尺度 [156,259 x 822] 縮減為 [2 x 822] …

library(RColorBrewer)
library(wordcloud)
library(Rtsne)

if(!LOAD) {
  t0 = Sys.time()
  set.seed(123)
  tsneCat = Rtsne(as.matrix(t(X)), check_duplicates=F, theta=0.0, max_iter=3000)
  Sys.time() - t0   # 2.769 mins
}

Time difference of 2.776 mins

在縮減尺度之中做階層式集群分析。 Clustering to set the color of words

Y = tsneCat$Y           # tSNE coordinates
d = dist(Y)             # distance matrix
hc = hclust(d)          # hi-clustering
K = 80                  # number of clusters 
C$group = g = cutree(hc,K)        # cut into K clusters
table(g) %>% as.vector %>% sort   # sizes of clusters

 [1]  3  3  3  3  4  4  4  4  5  5  5  5  6  6  6  6  6  6  6  7  7  7  7  7  7  8  8  8  8  8  8  8
[33]  9  9  9  9  9  9  9  9 10 10 10 10 10 10 10 10 11 11 11 12 12 13 13 13 13 14 14 14 14 14 14 14
[65] 14 14 14 14 15 15 16 16 16 17 18 20 20 21 22 23

Adjusting the range of C$n_rev (no. review) to set the fint size of words

range(-0.45 + log(C$n_rev)/5)

[1] 0.5092 2.5279

Plot word cloud to a .PNG file

png("fig/category.png", width=3200, height=1800)
textplot(Y[,1], Y[,2], C$name, font=2, 
         col = randomcoloR::distinctColorPalette(K)[g],
         cex = -0.45 + log(C$n_rev)/5 ) # size by no. reviews
dev.off()

png 
  2

將字雲畫在category.png裡面：

每個字代表一個商業類別(Categories)
字的顏色代表商業類別群組(Category Groups)
字的大小代表這個商業類別被評論的次數 (number of reviews)
靠在一起的、同一種顏色的字，代表經常一起出現的商業類別

我們分別使用了尺度縮減和集群分析來做以上的字雲，其中 …
■ 尺度縮減的
● 原始尺度有多少個？
● 縮減之後剩下多少尺度？
● 原始尺度是什麼？換句話說，我們是根據甚麼來做尺度縮減？

■ 我們是根據什麼做的集群分析？
● 是原始尺度、還是縮減之後的尺度？
● 用原始和縮減尺度、會有什麼差別？

(5) 評論話題字雲 Theme Word Cloud

使用字雲觀察評論話題(Theme)之間的相似性

5.1 Average Sentiment & Empath scores per business

接下來考慮評論的話題，我們已經預先使用 Empath Text Classifier ，依其預設的194種內容(Class)，對這4,736,865篇評論分別做過評分，文集之中的每一篇評論都有194個內容評分，放在 /home/tonychuo/_S2018/yelp10/data/empath.rdata 裡面；由於資料太大，我們已經先依商店和評論人分別對話題評分做過平均。

load(paste0(ypath, "biz_se.rdata"))

biz_senti - average sentiment scores per biz

dim(biz_senti)

[1] 156638     11

biz_emapth - average empath scores per biz

dim(biz_empath)

[1] 156638    195

It appearss that one business’s is NA

is.na(biz_senti) %>% sum

[1] 0

is.na(biz_empath) %>% sum

[1] 194

5.2 `E` - 各商店的平均話題權重

E = data.frame(bid = B$bid) %>% left_join(biz_empath)

Joining, by = "bid"

identical(B$bid, as.integer(rownames(X)))

[1] TRUE

identical(E$bid, as.integer(rownames(X)))

[1] TRUE

i = which(is.na(E$help)); i

[1] 12230

E = E[-i,]; X = X[-i,]; B = B[-i,] 
rm(biz_empath); gc()

            used  (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells   2253871 120.4    3886542  207.6   3886542  207.6
Vcells 107815810 822.6  680000431 5188.0 770394542 5877.7

rownames(E) = E$bid
E$bid = NULL
E = E[, order(-colSums(E))]

5.3 話題的討論強度 - Summerize Empath Scores across Biz

par(mar=c(6,4,4,2), mfrow=c(2,1), cex=0.7)
colSums(E[,1:20]) %>% barplot(main="Sums of Empath Scores, Top20 Themes", las=2)
colSums(E) %>% barplot(main="Sums of Empath Scores", las=2)

5.4 話題字雲 Theme Word Cloud

if(! LOAD) {
  t0 = Sys.time()
  set.seed(123)
  tsneTheme = E %>% scale %>% as.matrix %>% t %>% 
    Rtsne(check_duplicates=F, theta=0.0, max_iter=3000)
  Sys.time() - t0  # 23.69 secs
}

Time difference of 24.57 secs

Y = tsneTheme$Y         # tSNE coordinates
d = dist(Y)             # distance matrix
hc = hclust(d)          # hi-clustering
K = 40                  # number of clusters 
g = cutree(hc,K)                 # clustering for color
table(g) %>% as.vector %>% sort  # size of clusters

 [1]  2  2  2  2  2  2  3  3  3  3  3  4  4  4  4  4  4  4  4  5  5  5  5  5  5  5  5  6  6  6  6  7
[33]  7  7  7  8  8  8  9 10

range(log(colSums(E))/ 3 + 0.5)  # themes' weights for font size

[1] 0.7575 3.2098

png("fig/theme.png", width=3200, height=1800)
textplot(Y[,1], Y[,2], colnames(E), font=2, 
         col = randomcoloR::distinctColorPalette(K)[g],           # color by group    
         cex = log(colSums(E))/2 + 0.5 ) # size by no. reviews
dev.off()

png 
  2

5.5 `theme` - 話題摘要

theme = data.frame(
  name = colnames(E),
  weight = colSums(E),
  group = g
  )

(6) 話題、類別的對應關係

6.1 `TC` - Theme-Category Matrix

先將評論話題與商業類別之間的關係整理成矩陣

library(d3heatmap)
TC = apply(X, 2, function(i) 100*colMeans(E[i > 0,]) )
dim(TC)

[1] 194 822

sapply(list(TC, colSums(TC), rowSums(TC)), range)

      [,1]  [,2]     [,3]
[1,] 0.000 33.32    1.142
[2,] 3.301 41.40 1789.675

然後用熱圖表現出評論話題與商業類別之間的關係

TC[1:70, 1:100] %>% t %>% scale %>% t %>% 
  d3heatmap(colors = cm.colors(20))

6.2 話題與類別群組 Theme-Category Group Mapping

用同樣的方法，我們也可以用熱圖來表現話題群組和商業類別群組之間的關係

x = sapply(1:max(C$group), function(i) rowSums(X[,C$group == i]) > 0)
x = apply(x, 2, function(i) 100 * colMeans(E[i,]))
x = sapply(1:max(theme$group), function(i) 
  colMeans(x[theme$group==i,] ) )
sapply(list(TC, colSums(TC), rowSums(TC)), range)

      [,1]  [,2]     [,3]
[1,] 0.000 33.32    1.142
[2,] 3.301 41.40 1789.675

x %>% scale %>% t %>% d3heatmap(
  scale="none", 
  colors = rev(brewer.pal(11,"Spectral")))

(7) 情緒與商業類別

7.1 `S` - 商店的平均情緒分數

S = data.frame(bid = B$bid) %>% left_join(biz_senti)

Joining, by = "bid"

rownames(S) = S$bid
S$bid = NULL
identical(B$bid, as.integer(rownames(S)))

[1] TRUE

is.na(S) %>% sum

[1] 0

range(S)

[1]  0.00 30.33

par(cex=0.8)
boxplot(S, las=2, main="Avg. Sentiment per Biz")

library(corrplot)

corrplot 0.84 loaded

par(cex=0.7)
corrplot.mixed(cor(S))

7.2 `CS` (商業類別x情緒) $ `CT` (商業類別x話題) 矩陣

將評論情緒評分(10項) 和評論主題評分(194項) 依商業類別(822類) 平均起來，分別放在：

CT [822 x 10]: 10 Average Sentiment Scores per business category
CS [822 x 194]: 194 Average Class Weights per business category

這兩個矩陣裡面：

CS = apply(X, 2, function(i) colMeans(S[i > 0,])) %>% t
CT = t(TC)
dim(CS); dim(CT)

[1] 822  10

[1] 822 194

7.3 情緒的主成份分析

先對情緒矩陣(sx)做主成份分析

library(FactoMineR)
library(factoextra)

Loading required package: ggplot2

Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

library(highcharter)

Highcharts (www.highcharts.com) is a Highsoft software product which is

not free for commercial and Governmental use

ncp=10  # number of components to keep
pcx = PCA(CS, ncp=ncp, graph=F) 
par(cex=0.8)
barplot(pcx$eig[1:ncp,3],names=1:ncp,main="Accumulated Variance",
        xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')

跟據上圖，前兩個主成份就涵蓋了80%的變異量。但是當我們想要將商業類別標示在前兩個主成份的平面上的時候 …

par(cex=0.7)
fviz_pca_biplot(pcx)

7.4 繪圖輔助工具

近兩年來R的畫圖套件幾乎都具備了輸出互動網頁的能力，以下我們先寫一個helper function，來幫助我們檢視主成份分析的結果。

source("bipcx.R")

7.5 前三個主成分

使用上面bipcx()這個function，我們可以清楚的看到商業類別(由於後面的商業類別評論數不多，我們只畫前400個類別)在第一、二主成份 …

bipcx(pcx,1,2,10,400,t1="Strength",t2="Valence",obs='Biz Category',
      main="PCA on Sentiment Scores",ratio=0.5)

第二、三主成份

bipcx(pcx,3,2,10,300,t1="Arousal",t2="Valence",obs='Biz Category',
      main="PCA on Sentiment Scores")

從以上的圖形我們可以辨識出來，第一、二、三主成份正好分別代表情緒的：

強度 (Strength)
正負值 (Valence)
激發程度 (Arousal)

(8) 討論話題與商業類別

話題矩陣的尺度(194)比情緒矩陣(10)大很多，即使我們只挑前400個商業類別和前80個內容項目 …

ncp=30
# only take large categories and large classes
pcx = PCA(CT[1:400,1:80],ncp=ncp,graph=F) 
par(cex=0.8)
barplot(pcx$eig[1:ncp,3],names=1:ncp,main="Accumulated Variance",
        xlab="No. Components", ylab="% of Variance", las=2)
abline(h=seq(0,100,10),col='lightgray')  # 12 PC's cover ~75% of variance

做完主成份分析之後，前11個主成份也只涵蓋50%的變異量。在這種資料點和尺度都很多的狀況之下，互動式的圖表更能幫助我們觀察到原始尺度和資料點之間的關係。

以下我們將前幾個主成份，以兩兩成對的方式，分別畫出在該平面上變異最大的12個話題和100個商業類別。在這些平面上，我們通常可以看到一些不容易從簡單的敘事統計看出來的關係。

bipcx(pcx,1,2,12,100,obs='Biz Category',
      main="PCA on LIWC Classes, Dim. 1 & 2",ratio=0.5)

bipcx(pcx,3,4,12,100,obs='Biz Category',
      main="PCA on LIWC Classes, Dim. 3 & 4")

bipcx(pcx,1,3,12,100,obs='Biz Category',
      main="PCA on LIWC Classes, Dim. 1 & 3")

bipcx(pcx,2,4,12,100,obs='Biz Category',
      main="PCA on LIWC Classes, Dim. 2 & 4")

Save the Results

# category = C
# save(B, X, E, S, bizatt, category, theme, file="data/businesses.rdata", compress=T)

save(tsneCat, tsneTheme, file="data/tsne.rdata", compress=T)

# save(rev, senti, file="data/reviews.rdata", compress=T)

AS12 YelpExplore - 資料探索與清理

商業類別(Category) vs 評論話題(Theme)

卓雍然 tonychuo@gmail.com

2018-08-27 20:08:11

Load Yelp10 Data

(1) `rev` - 評論資料框

1.1 Quick Check

(2) `user` - 評論人資料框

(3) `X` - 商店類別矩陣 Biz-Category Matrix

3.1 `mxBC` - full BC matrix, 1240 categories

3.2 `X` - dense BC matrix, 822 categories

3.3 `B` - 商店基本資料

3.4 `C` - 商業類別摘要

(4a) 商業類別字雲 Category Word Cloud by Businesses

(5) 評論話題字雲 Theme Word Cloud

5.1 Average Sentiment & Empath scores per business

5.2 `E` - 各商店的平均話題權重

5.3 話題的討論強度 - Summerize Empath Scores across Biz

5.4 話題字雲 Theme Word Cloud

5.5 `theme` - 話題摘要

(6) 話題、類別的對應關係

6.1 `TC` - Theme-Category Matrix

6.2 話題與類別群組 Theme-Category Group Mapping

(7) 情緒與商業類別

7.1 `S` - 商店的平均情緒分數

7.2 `CS` (商業類別x情緒) $ `CT` (商業類別x話題) 矩陣

7.3 情緒的主成份分析

7.4 繪圖輔助工具

7.5 前三個主成分

(8) 討論話題與商業類別

Save the Results

AS12 YelpExplore - 資料探索與清理

商業類別(Category) vs 評論話題(Theme)

卓雍然 tonychuo@gmail.com

2018-08-27 20:08:11

Load Yelp10 Data

(1) rev - 評論資料框

1.1 Quick Check

(2) user - 評論人資料框

(3) X - 商店類別矩陣 Biz-Category Matrix

3.1 mxBC - full BC matrix, 1240 categories

3.2 X - dense BC matrix, 822 categories

3.3 B - 商店基本資料

3.4 C - 商業類別摘要

(4a) 商業類別字雲 Category Word Cloud by Businesses

(5) 評論話題字雲 Theme Word Cloud

5.1 Average Sentiment & Empath scores per business

5.2 E - 各商店的平均話題權重

5.3 話題的討論強度 - Summerize Empath Scores across Biz

5.4 話題字雲 Theme Word Cloud

5.5 theme - 話題摘要

(6) 話題、類別的對應關係

6.1 TC - Theme-Category Matrix

6.2 話題與類別群組 Theme-Category Group Mapping

(7) 情緒與商業類別

7.1 S - 商店的平均情緒分數

7.2 CS (商業類別x情緒) $ CT (商業類別x話題) 矩陣

7.3 情緒的主成份分析

7.4 繪圖輔助工具

7.5 前三個主成分

(8) 討論話題與商業類別

Save the Results

(1) `rev` - 評論資料框

(2) `user` - 評論人資料框

(3) `X` - 商店類別矩陣 Biz-Category Matrix

3.1 `mxBC` - full BC matrix, 1240 categories

3.2 `X` - dense BC matrix, 822 categories

3.3 `B` - 商店基本資料

3.4 `C` - 商業類別摘要

5.2 `E` - 各商店的平均話題權重

5.5 `theme` - 話題摘要

6.1 `TC` - Theme-Category Matrix

7.1 `S` - 商店的平均情緒分數

7.2 `CS` (商業類別x情緒) $ `CT` (商業類別x話題) 矩陣