rm(list=ls(all=T)); gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 489419 26.2     940480 50.3   592000 31.7
## Vcells 914218  7.0    1650153 12.6  1070804  8.2
library(knitr); opts_chunk$set(comment = NA)
library(Matrix)
library(dplyr)
library(googleVis)
library(doParallel)
library(Rtsne)
library(wordcloud)
library(RColorBrewer)
library(randomcoloR)

library(d3heatmap)
library(morpheus)
library(FactoMineR)
library(factoextra)
library(highcharter)
library(tm)
library(slam)
library(MASS)

options(width=100, gvis.plot.tag='chart', digits=4, scipen=60)
# options(width=100, digits=4, scipen=60)
Load Data
prep2 = "/home/tonychuo/_S2018/yelp10/prep2/"
load(paste0(prep2,"businesses.rdata"))
load(paste0(prep2,"reviews.rdata"))
Merge sentiment score into rev
rev = cbind(rev, senti[,-1])
rev$year =  as.integer(format(rev$date, "%Y"))
range(rev$date)
[1] "2004-07-22" "2017-07-26"

(1) 評論數量最多的100個商店

1.1 Top100 bid
b100 = rev %>% count(bid) %>% arrange(desc(n)) %>% head(100) %>% .$bid
1.2 Summarise reviews by bid and year
df = rev %>% filter(bid %in% b100) %>% 
  group_by(bid, year) %>% 
  summarise(
    positive = mean(positive),
    negative = mean(negative),
    sad = mean(sadness),
    n = n()
    )
1.3 Biz that have review every year after 2009
y09 = df %>% filter(year >= 2009) %>% count(bid) %>% filter(nn == 9) %>% .$bid
1.4 Google Motion Chart
df %>% filter(year >= 2010 & year <= 2016 & bid %in% y09) %>% 
  data.frame %>% 
  gvisMotionChart("bid", "year") %>% plot


(2) 商業類別的動態比較

2.1 依商業類別彙總評論資料
LOAD = TRUE
if(LOAD) { 
  load(paste0(prep2, "cat822.rdata"))
} else {

library(doParallel)
detectCores()
cores <- makeCluster(16)
registerDoParallel(cores)
getDoParWorkers()
t0 = Sys.time()
cat822 = foreach(i=1:822, .combine=rbind, .packages="Matrix") %dopar% {
  rx = rev[rev$bid %in% B$bid[X[,i]],]
  cbind(
    category = colnames(X)[i],
    n = as.integer( table(rx$year) ),
    aggregate(. ~ year, rx[,c(1,3:5,7,10:20)], mean)
    ) }
Sys.time() - t0 # 58.0772 secs
stopCluster(cores)

}
2.2 比較討論聲量相似的商業類別
df = cat822 %>% 
  filter(year >= 2010 & year <= 2016) %>%
  filter(category %in% colnames(X)[20:50]) %>%
  dplyr::select(category, year, negative, stars, useful, n,
         positive, joy, anticipation, trust, surprise,
         sadness, anger, disgust, fear, cool, funny
         )  %>% 
  group_by(year) %>%                  # scale scores by years so they become...
  mutate_at(c(3:17), scale) %>%       #   contemporarily relative  
  mutate(n = 0.1 + n - min(n) ) %>%   # aviod negative bubble size
  data.frame 

p = gvisMotionChart(df, "category", "year")
plot(p)


(3) Japanese 商業類別

3.1 設定商店評論選擇條件

選擇在

  • 2011年以後每年都有評論
  • 2011年以後評論的總數數量大於100、小於300
  • 商業類別包含日本

的商店,將其bid放在japs裡面

japs = rev %>% 
  filter(bid %in% B$bid[X[,"Japanese"]]) %>% 
  filter(year >= 2011) %>% 
  group_by(bid) %>% 
  summarise(                      
    n_year = n_distinct(year),   # 
    n_rev = n()                  # 
    ) %>% 
  filter(
    n_year == 7,
    n_rev > 100 & n_rev < 300
    ) %>% 
  dplyr::select(bid) %>% 
  left_join(B)
Joining, by = "bid"
nrow(japs)
[1] 64
3.2 選出評論

選出這一些商店在2011年之後的所有評論,放在資料框R裡面

R = rev %>% filter(bid %in% japs$bid & year >= 2011)
par(mar=c(3,4,3,2), cex=0.8)
R %>% count(bid) %>% .$n %>% hist(8, main="No. Reviews per Biz")


(4) 從平台取出評論內容:txt

#########################################################
library(dplyr) 
library(sparklyr)
Sys.setenv(SPARK_HOME="/usr/local/spark/spark-2.1.0-bin-hadoop2.7/")
config <- spark_config()
# config$spark.ui.port = "4044"
config$spark.executor.memory = "4G"
config$spark.driver.memory = "4G" 
config$spark.yarn.executor.memoryOverhead = "4096"
sc <- spark_connect(master = "spark://hnamenode:7077", config = config)

yelp = "hdfs://192.168.1.100:9000/home/tonychuo/yelp10/pq01/"
Text = spark_read_parquet(sc, "Text", paste0(yelp, "Text"))
txt = Text %>% filter(rid %in% R$rid) %>% collect

spark_disconnect(sc)
nrow(txt)
[1] 10925

總共有10,925篇評論

c(setequal(R$rid, txt$rid), identical(R$rid, txt$rid))
[1]  TRUE FALSE

從平台取出的評論次序和R$rid是不同的,所以需要排序一下

txt = left_join(R[,"rid",F], txt)
Joining, by = "rid"
identical(R$rid, txt$rid)
[1] TRUE
從主機取出話題評分
ypath = "/home/tonychuo/_S2018/yelp10/data/" 
load(paste0(ypath, "empath.rdata"))
empath = left_join(R[,"rid",F], empath)
Joining, by = "rid"
identical(R$rid, empath$rid)
[1] TRUE
gc()                                # release memory
            used  (Mb) gc trigger   (Mb)   max used   (Mb)
Ncells   2651651 141.7    4703850  251.3    3886542  207.6
Vcells 101942657 777.8  894107060 6821.5 1025762824 7826.0
par(mar=c(3,4,3,2), cex=0.8)
R$year %>% table %>% barplot(main="No. Review by Year")


(5) 聲量、星等、互動、情緒

5.1 商店間的動態比較
df = R %>% group_by(bid, year) %>% 
  mutate(engage = useful + cool + funny) %>% 
  summarise_at(
    c("engage", "stars", "positive", "negative"), 
    mean) %>% 
  left_join(count(R, bid, year)) %>% 
  data.frame
Joining, by = c("bid", "year")
p = gvisMotionChart(df, "bid", "year")
plot(p)
5.2 加入店名、去除極端值
df = left_join(df, B[,c("bid","name")])    # merge in biz name
Joining, by = "bid"
df = cbind(df[,-c(1:2)], df[,1:2,F]) %>%   # move bid to the last column
  filter(! bid %in% c(6115, 3406, 30751)) %>%
  data.frame

p = gvisMotionChart(df, "name", "year") 
plot(p)
5.3 星等上升、下降最快的商店

使用線性回歸找出星等上升、下降最快的商店

library(stringr)
bx = sapply(split(df, df$bid), function(x) {
  lm(stars ~ year, x) %>% coef %>% `[`("year") }) %>%  # take lm coef.
  .[ abs(.) > 0.15  ] %>%                              # 
  names %>% str_extract("[0-9]*") %>% 
  as.integer
p = df %>% filter(bid %in% bx) %>% gvisMotionChart("name", "year")
plot(p)


(6) 商店與話題

6.1 商店話題矩陣與熱圖

Make biz-theme matrix - mx

library(d3heatmap)
mx = sapply(split(empath[,-1], R$bid), colMeans) %>%  t    # biz-theme matrix
bnames = B$name[match(rownames(mx), B$bid)]
rownames(mx) = bnames

Select themes with medium-high weights

rx = colSums(mx) %>% quantile(c(0.60, 0.95))          #     
mx = mx[, colSums(mx) > rx[1] & colSums(mx) < rx[2]]  #

Check the range of weights

par(cex=0.8)
hist(log(mx+1e-4))

6.2 D3heatmap
mx %>% {log(.+1e-4)} %>% t %>% d3heatmap(color=rev(cm.colors(9)))
6.3 熱圖 + 互動式集群分析

這個工具不能投射在網頁上,只能直接在Rstudio裡面做

library(morpheus)
mx %>% {log(.+1e-4)} %>% morpheus


(7) 商店的情緒分佈

將商店投射到尺度縮減之後的情緒平面上

7.1 建立商店情緒矩陣
pcx = sapply(split(R[,10:19], R$bid), colMeans) %>% t
rownames(pcx) = bnames
7.2 尺度縮減
library(FactoMineR)
library(factoextra)
pcx = pcx %>% scale %>% PCA(ncp=10, graph=F) 
par(cex=0.8)
barplot(pcx$eig[1:10,3],names=1:10,main="Accumulated Variance",
        xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')

前三個主成分已經涵蓋了90%的變異

7.3 情緒平面上的商店
source("bipcx.R")

N = n_distinct(R$bid)
bipcx(pcx,1,2,10,N,t1="Strength",t2="Valence",
      obs='Business', main="Strength & Valence of Sentiment")
bipcx(pcx,3,2,10,N,t1="Arosual",t2="Valence",
      obs='Business', main="Strength & Valence of Sentiment")


(8) 文字分析與字雲

8.1 建立字頻表 (文件字詞矩陣)
library(tm)
dtm = txt$text %>% 
  iconv(to = "utf-8", sub="") %>% 
  VectorSource %>% Corpus %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(removePunctuation) %>% 
  tm_map(removeWords, stopwords("english")) %>% 
  tm_map(stemDocument) %>% 
  DocumentTermMatrix %>% 
  removeSparseTerms(0.998)
dtm  # (documents: 10925, terms: 2520)
<<DocumentTermMatrix (documents: 10925, terms: 2520)>>
Non-/sparse entries: 440830/27090170
Sparsity           : 98%
Maximal term length: 13
Weighting          : term frequency (tf)
8.2 使用TF-IDF篩選字詞
library(slam)
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
  log2(nrow(dtm)/col_sums(dtm > 0))
summary(tfidf)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0509  0.1198  0.1471  0.1631  0.1835  1.3746 
dtm = dtm[, tfidf > 0.1471 ]
dtm = dtm[,order(-col_sums(dtm))]
dim(dtm)
[1] 10925  1260
8.3 使用tSNE做尺度縮減
library(Rtsne)
n = 800
tsne = dtm[, 1:n] %>% as.data.frame.matrix %>% 
  scale %>% t %>% Rtsne(
    check_duplicates = FALSE, theta=0.0, max_iter=3200)
8.4 層級式集群分析
Y = tsne$Y              # tSNE coordinates
d = dist(Y)             # distance matrix
hc = hclust(d)          # hi-clustering
K = 100                 # number of clusters 
g = cutree(hc,K)        # cut into K clusters
table(g) %>% as.vector %>% sort         # sizes of clusters
  [1]  2  2  3  3  3  3  3  4  4  4  4  4  4  4  4  4  4  4  4  5  5  5  5  5  5  5  5  5  5  5  6
 [32]  6  6  6  6  6  6  6  6  6  6  7  7  7  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8  9  9  9
 [63]  9  9  9  9  9  9  9 10 10 10 10 10 10 11 11 11 11 11 11 12 12 12 12 12 12 13 13 13 13 13 14
 [94] 14 14 14 15 15 17 20
8.5 文字雲
library(randomcoloR)
library(wordcloud)

wc = col_sums(dtm[,1:n])
colors = distinctColorPalette(K)

png("fig/japs2.png", width=3200, height=1800)
textplot(
  Y[,1], Y[,2], colnames(dtm)[1:n], show=F, 
  col=colors[g],
  cex= 0.2 + 1.25 * sqrt(wc/mean(wc)),
  font=2)
dev.off()
png 
  2 

(9) 商務屬性 Business Attributes

9.1 bizatt 資料框
summary(bizatt)
      bid         AcceptsInsurance  AgesAllowed              Alcohol          BYOB        
 Min.   :     1   Mode :logical    18plus :    86   beer_and_wine:  6444   Mode :logical  
 1st Qu.: 39150   FALSE:2149       19plus :    63   full_bar     : 18767   FALSE:857      
 Median : 78310   TRUE :6426       21plus :   211   none         : 19022   TRUE :48       
 Mean   : 78311   NA's :147684     allages:    35   NA's         :112026   NA's :155354   
 3rd Qu.:117462                    NA's   :155864                                         
 Max.   :156639                                                                           
                                                                                          
      BYOBCorkage     BikeParking     BusinessAcceptsBitcoin BusinessAcceptsCreditCards
 no         :   752   Mode :logical   Mode :logical          Mode :logical             
 yes_corkage:   157   FALSE:16419     FALSE:8663             FALSE:7579                
 yes_free   :   491   TRUE :57064     TRUE :211              TRUE :113145              
 NA's       :154859   NA's :82776     NA's :147385           NA's :35535               
                                                                                       
                                                                                       
                                                                                       
 ByAppointmentOnly   Caters        CoatCheck        Corkage        DogsAllowed     DriveThru      
 Mode :logical     Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:18059       FALSE:17221     FALSE:6887      FALSE:512       FALSE:7642      FALSE:3686     
 TRUE :16714       TRUE :17306     TRUE :1089      TRUE :140       TRUE :3568      TRUE :2352     
 NA's :121486      NA's :121732    NA's :148283    NA's :155607    NA's :145049    NA's :150221   
                                                                                                  
                                                                                                  
                                                                                                  
 GoodForDancing  GoodForKids     HappyHour         HasTV             NoiseLevel     Open24Hours    
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   average  : 26119   Mode :logical  
 FALSE:6470      FALSE:11114     FALSE:2478      FALSE:21116     loud     :  3908   FALSE:318      
 TRUE :2062      TRUE :47232     TRUE :5937      TRUE :22601     quiet    :  8947   TRUE :29       
 NA's :147727    NA's :97913     NA's :147844    NA's :112542    very_loud:  1670   NA's :155912   
                                                                 NA's     :115615                  
                                                                                                   
                                                                                                   
 OutdoorSeating  RestaurantsAttire RestaurantsCounterService RestaurantsDelivery
 Mode :logical   casual: 43277     Mode :logical             Mode :logical      
 FALSE:29680     dressy:  1397     FALSE:150                 FALSE:35526        
 TRUE :20619     formal:   125     TRUE :246                 TRUE :11264        
 NA's :105960    NA's  :111460     NA's :155863              NA's :109469       
                                                                                
                                                                                
                                                                                
 RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsReservations RestaurantsTableService
 Mode :logical            Min.   :1              Mode :logical           Mode :logical          
 FALSE:6223               1st Qu.:1              FALSE:28014             FALSE:14318            
 TRUE :43391              Median :2              TRUE :18496             TRUE :25636            
 NA's :106645             Mean   :2              NA's :109749            NA's :116305           
                          3rd Qu.:2                                                             
                          Max.   :4                                                             
                          NA's   :59929                                                         
 RestaurantsTakeOut    Smoking       WheelchairAccessible   WiFi          casual       
 Mode :logical      no     :  3019   Mode :logical        free: 20924   Mode :logical  
 FALSE:5355         outdoor:  3513   FALSE:5815           no  : 21954   FALSE:23947    
 TRUE :49277        yes    :  1140   TRUE :38202          paid:   543   TRUE :18942    
 NA's :101627       NA's   :148587   NA's :112242         NA's:112838   NA's :113370   
                                                                                       
                                                                                       
                                                                                       
   classy          divey          hipster         intimate        romantic        touristy      
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:41968     FALSE:26060     FALSE:41903     FALSE:42299     FALSE:42367     FALSE:42658    
 TRUE :921       TRUE :1111      TRUE :963       TRUE :590       TRUE :522       TRUE :231      
 NA's :113370    NA's :129088    NA's :113393    NA's :113370    NA's :113370    NA's :113370   
                                                                                                
                                                                                                
                                                                                                
   trendy         upscale          friday          monday         saturday         sunday       
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:40921     FALSE:42492     FALSE:1857      FALSE:5698      FALSE:1726      FALSE:5113     
 TRUE :1968      TRUE :397       TRUE :4534      TRUE :693       TRUE :4665      TRUE :1278     
 NA's :113370    NA's :113370    NA's :149868    NA's :149868    NA's :149868    NA's :149868   
                                                                                                
                                                                                                
                                                                                                
  thursday        tuesday        wednesday         garage           lot            street       
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:3841      FALSE:5738      FALSE:5247      FALSE:84470     FALSE:54451     FALSE:72745    
 TRUE :2550      TRUE :653       TRUE :1144      TRUE :4950      TRUE :34969     TRUE :16675    
 NA's :149868    NA's :149868    NA's :149868    NA's :66839     NA's :66839     NA's :66839    
                                                                                                
                                                                                                
                                                                                                
   valet         validated       dairy-free      gluten-free       halal           kosher       
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:87761     FALSE:86762     FALSE:231       FALSE:225       FALSE:243       FALSE:245      
 TRUE :1659      TRUE :440       TRUE :18        TRUE :24        TRUE :6         TRUE :4        
 NA's :66839     NA's :69057     NA's :156010    NA's :156010    NA's :156010    NA's :156010   
                                                                                                
                                                                                                
                                                                                                
  soy-free         vegan         vegetarian      breakfast         brunch         dessert       
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:243       FALSE:128       FALSE:177       FALSE:39214     FALSE:39388     FALSE:41418    
 TRUE :6         TRUE :121       TRUE :72        TRUE :3726      TRUE :3552      TRUE :1522     
 NA's :156010    NA's :156010    NA's :156010    NA's :113319    NA's :113319    NA's :113319   
                                                                                                
                                                                                                
                                                                                                
   dinner        latenight         lunch         africanamerican   asian          coloring      
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:27638     FALSE:40462     FALSE:25109     FALSE:751       FALSE:598       FALSE:279      
 TRUE :15302     TRUE :2478      TRUE :17831     TRUE :423       TRUE :576       TRUE :1126     
 NA's :113319    NA's :113319    NA's :113319    NA's :155085    NA's :155085    NA's :154854   
                                                                                                
                                                                                                
                                                                                                
   curly         extensions         kids           perms         straightperms   background_music
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   
 FALSE:399       FALSE:570       FALSE:693       FALSE:745       FALSE:672       FALSE:6193      
 TRUE :1006      TRUE :835       TRUE :712       TRUE :660       TRUE :502       TRUE :1833      
 NA's :154854    NA's :154854    NA's :154854    NA's :154854    NA's :155085    NA's :148233    
                                                                                                 
                                                                                                 
                                                                                                 
     dj           jukebox         karaoke           live          no_music         video        
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:6312      FALSE:6895      FALSE:7785      FALSE:6621      FALSE:8026      FALSE:7819     
 TRUE :1714      TRUE :1131      TRUE :241       TRUE :1405      NA's :148233    TRUE :207      
 NA's :148233    NA's :148233    NA's :148233    NA's :148233                    NA's :148233   
                                                                                                
                                                                                                
                                                                                                
bizatt$RestaurantsPriceRange2[ X[,"Restaurants"] ] %>% table(useNA="ifany")
.
    1     2     3     4  <NA> 
19013 24917  2845   537  4299 
grep("mex|ita",colnames(X),T, value=T)
 [1] "Italian"                "Mexican"                "Tex-Mex"               
 [4] "Hospitals"              "Vitamins & Supplements" "Rehabilitation Center" 
 [7] "Guitar Stores"          "Meditation Centers"     "Emergency Pet Hospital"
[10] "New Mexican Cuisine"   
italian = X[,"Italian"] & X[,"Restaurants"]
mexican = X[,"Mexican"] & X[,"Restaurants"]
c(sum(italian), sum(mexican), sum(italian & mexican))
[1] 4410 3913   25
bizatt$RestaurantsPriceRange2[ italian ] %>% table(useNA="ifany")
.
   1    2    3    4 <NA> 
 788 2698  519   70  335 
bizatt$RestaurantsPriceRange2[ mexican ] %>% table(useNA="ifany")
.
   1    2    3    4 <NA> 
2286 1388   41    5  193