rm(list=ls(all=T)); gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 487825 26.1     940480 50.3   592000 31.7
## Vcells 910440  7.0    1650153 12.6  1075672  8.3
library(knitr); opts_chunk$set(comment = NA)
library(Matrix)
library(dplyr)
library(googleVis)
library(doParallel)
library(Rtsne)
library(wordcloud)
library(RColorBrewer)
library(randomcoloR)

library(d3heatmap)
library(morpheus)
library(FactoMineR)
library(factoextra)
library(highcharter)
library(tm)
library(slam)
library(MASS)

options(width=100, gvis.plot.tag='chart', digits=4, scipen=60)
# options(width=100, digits=4, scipen=60)
Load Data
load("data/Biz.rdata")
load("data/Rev.rdata")
load("data/BCscores.rdata")

(1) 評論數量最多的100個商店

1.1 Top100 bid
b100 = rev %>% count(bid) %>% arrange(desc(n)) %>% head(100) %>% .$bid
1.2 Summarise reviews by bid and year
df = rev %>% filter(bid %in% b100) %>% 
  group_by(bid, year) %>% 
  summarise(
    positive = mean(positive),
    negative = mean(negative),
    sad = mean(sadness),
    n = n()
    )
1.3 Biz that have review every year after 2009
y09 = df %>% filter(year >= 2009) %>% count(bid) %>% filter(nn == 10) %>% .$bid
1.4 Google Motion Chart
df %>% filter(year >= 2010 & year <= 2017 & bid %in% y09) %>% 
  data.frame %>% 
  gvisMotionChart("bid", "year") %>% plot


(2) 商業類別的動態比較

2.1 依商業類別彙總評論資料
#
# This chunk should be executed in high-end server 
#
# library(doParallel)
# detectCores()
# cores <- makeCluster(4)
# registerDoParallel(cores)
# getDoParWorkers()
# t0 = Sys.time()
# cat936 = foreach(i=1:936, .combine=rbind, .packages="Matrix") %dopar% {
#   rx = rev[rev$bid %in% B$bid[X[,i]],]
#   cbind(
#     category = colnames(X)[i],
#     n = as.integer( table(rx$year) ),
#     aggregate(. ~ year, rx[,c(4:17,20,21)], mean)
#   ) }
# Sys.time() - t0         # 21.229 mins
# stopCluster(cores)
#
# CatYr = cat936 %>% mutate(
#   sentiment = positive - negative,
#   engagement = log(1 + cool + funny + useful)
#   ) %>%
#   group_by(year) %>% mutate(
#     z_sentiment = scale(sentiment),
#     z_engagement = scale(engagement)
#     ) %>% ungroup %>% data.frame
# save(CatYr, file="data/CatYr.rdata",compress=T)
#
load("data/CatYr.rdata")
2.2 比較討論聲量相似的商業類別
bizcats =  function(cats, yr) {
  CatYr %>% as_tibble %>%  
    filter(year >= yr) %>%
    filter(category %in% cats) %>%
    dplyr::select(
      category, year, 
      z_sentiment, stars, z_engagement, n, 
      engagement, sentiment, useful, cool, funny,
      positive, joy, anticipation, trust, surprise,
      negative, sadness, anger, disgust, fear
    )  %>% 
    data.frame %>% gvisMotionChart("category", "year") %>% plot()
}
# bizcats(colnames(X)[1:10], 2009)
bizcats(colnames(X)[11:40], 2009)
# bizcats(colnames(X)[41:70], 2009)
# bizcats(colnames(X)[71:100], 2009)


(3) “Beauty & Spas” 商業類別

3.1 設定商店評論選擇條件

選擇在

  • 2011年以後每年都有評論
  • 2011年以後評論的總數數量大於100、小於300
  • 商業類別包含日本

的商店,將其bid放在japs裡面

spas = rev %>% 
  filter(bid %in% B$bid[X[,"Beauty & Spas"]]) %>% 
  filter(year >= 2011) %>% 
  group_by(bid) %>% 
  summarise(                      
    n_year = n_distinct(year),   # 
    n_rev = n()                  # 
  ) %>% 
  filter(
    n_year == 8,
    n_rev > 200 & n_rev < 500
  ) %>% 
  dplyr::select(bid) %>% 
  left_join(B)
Joining, by = "bid"
nrow(spas)
[1] 44
3.2 選出評論

選出這一些商店在2011年之後的所有評論,放在資料框R裡面

R = rev %>% filter(bid %in% spas$bid & year >= 2011)
par(mar=c(3,4,3,2), cex=0.8)
R %>% count(bid) %>% .$n %>% hist(8, main="No. Reviews per Biz")


(4) 聲量、星等、互動、情緒

par(mar=c(3,4,3,2), cex=0.8)
R$year %>% table %>% barplot(main="No. Review by Year")


4.1 商店間的動態比較
R %>% 
  filter(year >= 2014) %>% 
  filter(! bid %in% c(173534,63170)) %>% 
  group_by(bid, year) %>% 
  mutate(
    engage = log(1 + useful + cool + funny),
    senti = positive - negative
  ) %>% 
  summarise(
    engage = mean(engage),
    stars = mean(stars),
    senti = mean(senti),
    n = n() ) %>% 
  ungroup %>% group_by(year) %>% 
  mutate(
    z_senti = scale(senti),
    z_engage = scale(engage)
    ) %>% ungroup %>% 
  left_join(B[,c("bid","name")]) %>% data.frame -> df
Joining, by = "bid"
chain = table(df$name) %>% .[. > 5] %>% names
i = df$name %in% chain 
df$name[i] = paste(df$name[i], df$bid[i])

df = df[,c("name", "year", "senti", "stars", "engage", "n", 
           "z_senti","z_engage","bid")] 
df %>% gvisMotionChart("name", "year") %>% plot()
5.3 星等上升、下降最快的商店

使用線性回歸找出星等上升、下降最快的商店

library(stringr)
bx = sapply(split(df, df$bid), function(x) {
  lm(stars ~ year, x) %>% coef %>% `[`("year") }) %>%  # take lm coef.
  .[ abs(.) > 0.15 ] %>%                              # 
  names %>% str_extract("[0-9]*") %>% 
  as.integer

df %>% filter(bid %in% bx) %>% dplyr::select(-bid) %>% 
  gvisMotionChart("name", "year") %>% plot()


(5) 取出評論和話題權重

load("data/Tmx.rdata")
empath = cbind(R[,"rid"], as.data.frame.matrix(Tmx[R$rid,]))
rm(Tmx); gc()
            used   (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells   3596342  192.1    5684620  303.6   5684620  303.6
Vcells 144112706 1099.5  437548602 3338.3 370773824 2828.8

(6) 商店與話題

6.1 商店話題矩陣與熱圖
# Make biz-theme matrix - `mx`
mx = sapply(split(empath[,-1], R$bid), colMeans) %>%  t    # biz-theme matrix
bnames = B$name[match(rownames(mx), B$bid)]
rownames(mx) = bnames

# Select themes with medium-high weights
rx = colSums(mx) %>% quantile(c(0.5, 1.0))            #     
mx = mx[, colSums(mx) > rx[1] & colSums(mx) < rx[2]]  #

# Check & adjust the range of weights
par(cex=0.8)      
hist(log(mx+1e-4))

6.2 D3heatmap
library(d3heatmap)
mx %>% {log(.+1e-4)} %>% t %>% d3heatmap(color=cm.colors(17))
6.3 熱圖 + 互動式集群分析

這個工具不能投射在網頁上,只能直接在Rstudio裡面做

# library(morpheus)
# mx %>% {log(.+1e-4)} %>% morpheus


(7) 商店的情緒分佈

將商店投射到尺度縮減之後的情緒平面上 ##### 7.1 建立商店情緒矩陣

pcx = sapply(split(R[,8:17], R$bid), colMeans) %>% t
rownames(pcx) = bnames
7.2 尺度縮減
library(FactoMineR)
library(factoextra)
pcx = pcx %>% scale %>% PCA(ncp=10, graph=F) 
par(cex=0.8)
barplot(pcx$eig[1:10,3],names=1:10,main="Accumulated Variance",
        xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')

前三個主成分已經涵蓋了90%的變異

7.3 情緒平面上的商店
source("bipcx.R")

N = n_distinct(R$bid)
bipcx(pcx,1,2,10,N,t1="Strength",t2="Valence",
      obs='Business', main="Strength & Valence of Sentiment")
bipcx(pcx,3,2,10,N,t1="Arosual",t2="Valence",
      obs='Business', main="Strength & Valence of Sentiment")


(8) 文字分析與字雲

load("data/Txt.rdata")
txt = Txt[R$rid]
rm(Txt); gc()
            used (Mb) gc trigger   (Mb)  max used   (Mb)
Ncells   3668992  196    9968622  532.4   9664284  516.2
Vcells 151662250 1157  504196788 3846.8 615038923 4692.4
8.1 建立字頻表 (文件字詞矩陣)
# Sys.setlocale("LC_ALL","C")
library(tm)
dtm = txt %>% 
  iconv(to = "utf-8", sub="") %>% 
  VectorSource %>% Corpus %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(removePunctuation) %>% 
  tm_map(removeWords, stopwords("english")) %>% 
  tm_map(stemDocument) %>% 
  DocumentTermMatrix %>% 
  removeSparseTerms(0.998)
dtm  # (documents: 12729, terms: 3145)
<<DocumentTermMatrix (documents: 12729, terms: 2479)>>
Non-/sparse entries: 612069/30943122
Sparsity           : 98%
Maximal term length: 13
Weighting          : term frequency (tf)
8.2 使用TF-IDF篩選字詞
library(slam)
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
  log2(nrow(dtm)/col_sums(dtm > 0))
summary(tfidf)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0380  0.0897  0.1100  0.1204  0.1353  0.5020 
dtm = dtm[, tfidf > 0.0897 ]
dtm = dtm[,order(-col_sums(dtm))]
dim(dtm)
[1] 12729  1859
8.3 使用tSNE做尺度縮減
library(Rtsne)
n = 1000
tsne = dtm[, 1:n] %>% as.data.frame.matrix %>% 
  scale %>% t %>% 
  Rtsne(check_dup=F, theta=0.0, max_iter=3000)
8.4 層級式集群分析
Y = tsne$Y              # tSNE coordinates
d = dist(Y)             # distance matrix
hc = hclust(d)          # hi-clustering
K = 100                 # number of clusters 
g = cutree(hc,K)        # cut into K clusters
table(g) %>% as.vector %>% sort         # sizes of clusters
  [1]  2  2  2  2  2  2  2  3  3  3  4  4  4  5  5  5  5  6  6  6  6  6  6  6  6  6  7  7  7  7  7
 [32]  7  8  8  8  8  8  8  8  8  8  8  8  8  8  9  9  9  9  9  9  9  9 10 10 10 10 10 10 11 11 11
 [63] 11 11 11 12 13 13 13 13 13 13 13 13 13 13 14 14 14 14 14 14 14 15 15 15 15 15 16 17 17 17 18
 [94] 19 19 20 20 21 24 24
8.5 文字雲
wc = col_sums(dtm[,1:n])
sz = 0.15 + sqrt(wc/mean(wc))
range(sz)
[1] 0.6989 6.3995
library(randomcoloR)
library(wordcloud)

colors = distinctColorPalette(K)
png("fig/spas2.png", width=3200, height=1800)
textplot(
  Y[,1], Y[,2], colnames(dtm)[1:n], show=F, 
  col=colors[g],
  cex= sz,
  font=2)
dev.off()
png 
  2