Yelp12 Demo - 資料分析示範

rm(list=ls(all=T)); gc()

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 487825 26.1     940480 50.3   592000 31.7
## Vcells 910440  7.0    1650153 12.6  1075672  8.3

library(knitr); opts_chunk$set(comment = NA)
library(Matrix)
library(dplyr)
library(googleVis)
library(doParallel)
library(Rtsne)
library(wordcloud)
library(RColorBrewer)
library(randomcoloR)

library(d3heatmap)
library(morpheus)
library(FactoMineR)
library(factoextra)
library(highcharter)
library(tm)
library(slam)
library(MASS)

options(width=100, gvis.plot.tag='chart', digits=4, scipen=60)
# options(width=100, digits=4, scipen=60)

Load Data

load("data/Biz.rdata")
load("data/Rev.rdata")
load("data/BCscores.rdata")

(1) 評論數量最多的100個商店

1.1 Top100 bid

b100 = rev %>% count(bid) %>% arrange(desc(n)) %>% head(100) %>% .$bid

1.2 Summarise reviews by bid and year

df = rev %>% filter(bid %in% b100) %>% 
  group_by(bid, year) %>% 
  summarise(
    positive = mean(positive),
    negative = mean(negative),
    sad = mean(sadness),
    n = n()
    )

1.3 Biz that have review every year after 2009

y09 = df %>% filter(year >= 2009) %>% count(bid) %>% filter(nn == 10) %>% .$bid

1.4 Google Motion Chart

df %>% filter(year >= 2010 & year <= 2017 & bid %in% y09) %>% 
  data.frame %>% 
  gvisMotionChart("bid", "year") %>% plot

(2) 商業類別的動態比較

2.1 依商業類別彙總評論資料

#
# This chunk should be executed in high-end server 
#
# library(doParallel)
# detectCores()
# cores <- makeCluster(4)
# registerDoParallel(cores)
# getDoParWorkers()
# t0 = Sys.time()
# cat936 = foreach(i=1:936, .combine=rbind, .packages="Matrix") %dopar% {
#   rx = rev[rev$bid %in% B$bid[X[,i]],]
#   cbind(
#     category = colnames(X)[i],
#     n = as.integer( table(rx$year) ),
#     aggregate(. ~ year, rx[,c(4:17,20,21)], mean)
#   ) }
# Sys.time() - t0         # 21.229 mins
# stopCluster(cores)
#
# CatYr = cat936 %>% mutate(
#   sentiment = positive - negative,
#   engagement = log(1 + cool + funny + useful)
#   ) %>%
#   group_by(year) %>% mutate(
#     z_sentiment = scale(sentiment),
#     z_engagement = scale(engagement)
#     ) %>% ungroup %>% data.frame
# save(CatYr, file="data/CatYr.rdata",compress=T)
#

load("data/CatYr.rdata")

2.2 比較討論聲量相似的商業類別

bizcats =  function(cats, yr) {
  CatYr %>% as_tibble %>%  
    filter(year >= yr) %>%
    filter(category %in% cats) %>%
    dplyr::select(
      category, year, 
      z_sentiment, stars, z_engagement, n, 
      engagement, sentiment, useful, cool, funny,
      positive, joy, anticipation, trust, surprise,
      negative, sadness, anger, disgust, fear
    )  %>% 
    data.frame %>% gvisMotionChart("category", "year") %>% plot()
}
# bizcats(colnames(X)[1:10], 2009)
bizcats(colnames(X)[11:40], 2009)

# bizcats(colnames(X)[41:70], 2009)
# bizcats(colnames(X)[71:100], 2009)

(3) “Beauty & Spas” 商業類別

3.1 設定商店評論選擇條件

選擇在

2011年以後每年都有評論
2011年以後評論的總數數量大於100、小於300
商業類別包含日本

的商店，將其bid放在japs裡面

spas = rev %>% 
  filter(bid %in% B$bid[X[,"Beauty & Spas"]]) %>% 
  filter(year >= 2011) %>% 
  group_by(bid) %>% 
  summarise(                      
    n_year = n_distinct(year),   # 
    n_rev = n()                  # 
  ) %>% 
  filter(
    n_year == 8,
    n_rev > 200 & n_rev < 500
  ) %>% 
  dplyr::select(bid) %>% 
  left_join(B)

Joining, by = "bid"

nrow(spas)

[1] 44

3.2 選出評論

選出這一些商店在2011年之後的所有評論，放在資料框R裡面

R = rev %>% filter(bid %in% spas$bid & year >= 2011)
par(mar=c(3,4,3,2), cex=0.8)
R %>% count(bid) %>% .$n %>% hist(8, main="No. Reviews per Biz")

(4) 聲量、星等、互動、情緒

par(mar=c(3,4,3,2), cex=0.8)
R$year %>% table %>% barplot(main="No. Review by Year")

4.1 商店間的動態比較

R %>% 
  filter(year >= 2014) %>% 
  filter(! bid %in% c(173534,63170)) %>% 
  group_by(bid, year) %>% 
  mutate(
    engage = log(1 + useful + cool + funny),
    senti = positive - negative
  ) %>% 
  summarise(
    engage = mean(engage),
    stars = mean(stars),
    senti = mean(senti),
    n = n() ) %>% 
  ungroup %>% group_by(year) %>% 
  mutate(
    z_senti = scale(senti),
    z_engage = scale(engage)
    ) %>% ungroup %>% 
  left_join(B[,c("bid","name")]) %>% data.frame -> df

Joining, by = "bid"

chain = table(df$name) %>% .[. > 5] %>% names
i = df$name %in% chain 
df$name[i] = paste(df$name[i], df$bid[i])

df = df[,c("name", "year", "senti", "stars", "engage", "n", 
           "z_senti","z_engage","bid")] 
df %>% gvisMotionChart("name", "year") %>% plot()

5.3 星等上升、下降最快的商店

使用線性回歸找出星等上升、下降最快的商店

library(stringr)
bx = sapply(split(df, df$bid), function(x) {
  lm(stars ~ year, x) %>% coef %>% `[`("year") }) %>%  # take lm coef.
  .[ abs(.) > 0.15 ] %>%                              # 
  names %>% str_extract("[0-9]*") %>% 
  as.integer

df %>% filter(bid %in% bx) %>% dplyr::select(-bid) %>% 
  gvisMotionChart("name", "year") %>% plot()

(5) 取出評論和話題權重

load("data/Tmx.rdata")
empath = cbind(R[,"rid"], as.data.frame.matrix(Tmx[R$rid,]))
rm(Tmx); gc()

            used   (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells   3596342  192.1    5684620  303.6   5684620  303.6
Vcells 144112706 1099.5  437548602 3338.3 370773824 2828.8

(6) 商店與話題

6.1 商店話題矩陣與熱圖

# Make biz-theme matrix - `mx`
mx = sapply(split(empath[,-1], R$bid), colMeans) %>%  t    # biz-theme matrix
bnames = B$name[match(rownames(mx), B$bid)]
rownames(mx) = bnames

# Select themes with medium-high weights
rx = colSums(mx) %>% quantile(c(0.5, 1.0))            #     
mx = mx[, colSums(mx) > rx[1] & colSums(mx) < rx[2]]  #

# Check & adjust the range of weights
par(cex=0.8)      
hist(log(mx+1e-4))

6.2 D3heatmap

library(d3heatmap)
mx %>% {log(.+1e-4)} %>% t %>% d3heatmap(color=cm.colors(17))

6.3 熱圖 + 互動式集群分析

這個工具不能投射在網頁上，只能直接在Rstudio裡面做

# library(morpheus)
# mx %>% {log(.+1e-4)} %>% morpheus

(7) 商店的情緒分佈

將商店投射到尺度縮減之後的情緒平面上 ##### 7.1 建立商店情緒矩陣

pcx = sapply(split(R[,8:17], R$bid), colMeans) %>% t
rownames(pcx) = bnames

7.2 尺度縮減

library(FactoMineR)
library(factoextra)
pcx = pcx %>% scale %>% PCA(ncp=10, graph=F)

par(cex=0.8)
barplot(pcx$eig[1:10,3],names=1:10,main="Accumulated Variance",
        xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')

前三個主成分已經涵蓋了90%的變異

7.3 情緒平面上的商店

source("bipcx.R")

N = n_distinct(R$bid)
bipcx(pcx,1,2,10,N,t1="Strength",t2="Valence",
      obs='Business', main="Strength & Valence of Sentiment")

bipcx(pcx,3,2,10,N,t1="Arosual",t2="Valence",
      obs='Business', main="Strength & Valence of Sentiment")

(8) 文字分析與字雲

load("data/Txt.rdata")
txt = Txt[R$rid]
rm(Txt); gc()

            used (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells   3668992  196    9968622  532.4   9664284  516.2
Vcells 151662250 1157  504196788 3846.8 615038923 4692.4

8.1 建立字頻表 (文件字詞矩陣)

# Sys.setlocale("LC_ALL","C")
library(tm)
dtm = txt %>% 
  iconv(to = "utf-8", sub="") %>% 
  VectorSource %>% Corpus %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(removePunctuation) %>% 
  tm_map(removeWords, stopwords("english")) %>% 
  tm_map(stemDocument) %>% 
  DocumentTermMatrix %>% 
  removeSparseTerms(0.998)
dtm  # (documents: 12729, terms: 3145)

<<DocumentTermMatrix (documents: 12729, terms: 2479)>>
Non-/sparse entries: 612069/30943122
Sparsity           : 98%
Maximal term length: 13
Weighting          : term frequency (tf)

8.2 使用`TF-IDF`篩選字詞

library(slam)
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
  log2(nrow(dtm)/col_sums(dtm > 0))
summary(tfidf)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0380  0.0897  0.1100  0.1204  0.1353  0.5020

dtm = dtm[, tfidf > 0.0897 ]
dtm = dtm[,order(-col_sums(dtm))]
dim(dtm)

[1] 12729  1859

8.3 使用`tSNE`做尺度縮減

library(Rtsne)
n = 1000
tsne = dtm[, 1:n] %>% as.data.frame.matrix %>% 
  scale %>% t %>% 
  Rtsne(check_dup=F, theta=0.0, max_iter=3000)

8.4 層級式集群分析

Y = tsne$Y              # tSNE coordinates
d = dist(Y)             # distance matrix
hc = hclust(d)          # hi-clustering
K = 100                 # number of clusters 
g = cutree(hc,K)        # cut into K clusters
table(g) %>% as.vector %>% sort         # sizes of clusters

  [1]  2  2  2  2  2  2  2  3  3  3  4  4  4  5  5  5  5  6  6  6  6  6  6  6  6  6  7  7  7  7  7
 [32]  7  8  8  8  8  8  8  8  8  8  8  8  8  8  9  9  9  9  9  9  9  9 10 10 10 10 10 10 11 11 11
 [63] 11 11 11 12 13 13 13 13 13 13 13 13 13 13 14 14 14 14 14 14 14 15 15 15 15 15 16 17 17 17 18
 [94] 19 19 20 20 21 24 24

8.5 文字雲

wc = col_sums(dtm[,1:n])
sz = 0.15 + sqrt(wc/mean(wc))
range(sz)

[1] 0.6989 6.3995

library(randomcoloR)
library(wordcloud)

colors = distinctColorPalette(K)
png("fig/spas2.png", width=3200, height=1800)
textplot(
  Y[,1], Y[,2], colnames(dtm)[1:n], show=F, 
  col=colors[g],
  cex= sz,
  font=2)
dev.off()

png 
  2

Yelp12 Demo - 資料分析示範

卓雍然 D994010001

2018-09-09 02:12:09

Load Data

(1) 評論數量最多的100個商店

1.1 Top100 bid

1.2 Summarise reviews by bid and year

1.3 Biz that have review every year after 2009

1.4 Google Motion Chart

(2) 商業類別的動態比較

2.1 依商業類別彙總評論資料

2.2 比較討論聲量相似的商業類別

(3) “Beauty & Spas” 商業類別

3.1 設定商店評論選擇條件

3.2 選出評論

(4) 聲量、星等、互動、情緒

4.1 商店間的動態比較

5.3 星等上升、下降最快的商店

(5) 取出評論和話題權重

(6) 商店與話題

6.1 商店話題矩陣與熱圖

6.2 D3heatmap

6.3 熱圖 + 互動式集群分析

(7) 商店的情緒分佈

7.2 尺度縮減

7.3 情緒平面上的商店

(8) 文字分析與字雲

8.1 建立字頻表 (文件字詞矩陣)

8.2 使用`TF-IDF`篩選字詞

8.3 使用`tSNE`做尺度縮減

8.4 層級式集群分析

8.5 文字雲

Yelp12 Demo - 資料分析示範

卓雍然 D994010001

2018-09-09 02:12:09

Load Data

(1) 評論數量最多的100個商店

1.1 Top100 bid

1.2 Summarise reviews by bid and year

1.3 Biz that have review every year after 2009

1.4 Google Motion Chart

(2) 商業類別的動態比較

2.1 依商業類別彙總評論資料

2.2 比較討論聲量相似的商業類別

(3) “Beauty & Spas” 商業類別

3.1 設定商店評論選擇條件

3.2 選出評論

(4) 聲量、星等、互動、情緒

4.1 商店間的動態比較

5.3 星等上升、下降最快的商店

(5) 取出評論和話題權重

(6) 商店與話題

6.1 商店話題矩陣與熱圖

6.2 D3heatmap

6.3 熱圖 + 互動式集群分析

(7) 商店的情緒分佈

7.2 尺度縮減

7.3 情緒平面上的商店

(8) 文字分析與字雲

8.1 建立字頻表 (文件字詞矩陣)

8.2 使用TF-IDF篩選字詞

8.3 使用tSNE做尺度縮減

8.4 層級式集群分析

8.5 文字雲

8.2 使用`TF-IDF`篩選字詞

8.3 使用`tSNE`做尺度縮減