產業類別趨勢分析 商店總趨勢 主成分分析 文字雲


rm(list=ls(all=T)); gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 489484 26.2     940480 50.3   592000 31.7
## Vcells 914906  7.0    1650153 12.6  1079204  8.3
library(knitr); opts_chunk$set(comment = NA)
library(Matrix)
library(dplyr)
library(googleVis)
library(doParallel)
library(Rtsne)
library(wordcloud)
library(RColorBrewer)
library(randomcoloR)

library(d3heatmap)
library(morpheus)
library(FactoMineR)
library(factoextra)
library(highcharter)
library(tm)
library(slam)
library(MASS)

options(width=100, gvis.plot.tag='chart', digits=4, scipen=60) #網頁
#options(width=100, digits=4, scipen=60) #本機
Load Data
prep2 = "/home/tonychuo/_S2018/yelp10/prep2/"
load(paste0(prep2,"businesses.rdata"))
load(paste0(prep2,"reviews.rdata"))
Merge sentiment score into rev
rev = cbind(rev, senti[,-1])
rev$year =  as.integer(format(rev$date, "%Y"))
range(rev$date)
[1] "2004-07-22" "2017-07-26"

(1) 評論數量最多的100個商店

1.1 Top100 bid
b100 = rev %>% count(bid) %>% arrange(desc(n)) %>% head(100) %>% .$bid
#每個bid出現多少次
1.2 Summarise reviews by bid and year
df = rev %>% filter(bid %in% b100) %>% 
  group_by(bid, year) %>% 
  summarise(
    positive = mean(positive),
    negative = mean(negative),
    sad = mean(sadness),
    n = n()
    )
#把bid在100名的人都過濾出來,group by bid,year(每個商店每一年都有一筆資料)去算他們正面跟負面情緒
#該商店該年正面、負面情緒平均值
1.3 Biz that have review every year after 2009
y09 = df %>% filter(year >= 2009) %>% count(bid) %>% filter(nn == 9) %>% .$bid
#找2009年後,每一年都有評論的
1.4 Google Motion Chart
df %>% filter(year >= 2010 & year <= 2016 & bid %in% y09) %>% 
  data.frame %>% 
  gvisMotionChart("bid", "year") %>% plot


(2) 商業類別的動態比較

2.1 依商業類別彙總評論資料
LOAD = TRUE
if(LOAD) { 
  load(paste0(prep2, "cat822.rdata")) #商業類別的summary
} else {

library(doParallel)
detectCores()
cores <- makeCluster(16)
registerDoParallel(cores)
getDoParWorkers()
t0 = Sys.time()
cat822 = foreach(i=1:822, .combine=rbind, .packages="Matrix") %dopar% {
  rx = rev[rev$bid %in% B$bid[X[,i]],]
  cbind(
    category = colnames(X)[i],
    n = as.integer( table(rx$year) ),
    aggregate(. ~ year, rx[,c(1,3:5,7,10:20)], mean)
    ) }
Sys.time() - t0 # 58.0772 secs
stopCluster(cores)

}


2.2 比較討論聲量相似的商業類別
df = cat822 %>% 
  filter(year >= 2010 & year <= 2017) %>%
  filter(category %in% colnames(X)[2:50]) %>%
  dplyr::select(category, year, negative, stars, useful, n,
         positive, joy, anticipation, trust, surprise,
         sadness, anger, disgust, fear, cool, funny
         )  %>% 
  #group_by(year) %>%                  # scale scores by years so they become...
  #mutate_at(c(3:17), scale) %>%       #   contemporarily relative  
  #mutate(n = 0.1 + n - min(n) ) %>%   # aviod negative bubble size
  data.frame 

p = gvisMotionChart(df, "category", "year")
plot(p)


scale後

df = cat822 %>% 
  filter(year >= 2010 & year <= 2017) %>%
  filter(category %in% colnames(X)[2:50]) %>%
  dplyr::select(category, year, negative, stars, useful, n,
         positive, joy, anticipation, trust, surprise,
         sadness, anger, disgust, fear, cool, funny
         )  %>% 
  group_by(year) %>%                  # scale scores by years so they become...
  mutate_at(c(3:17), scale) %>%       #   contemporarily relative
  mutate(n = 0.1 + n - min(n) ) %>%   # aviod negative bubble size
  data.frame 

p = gvisMotionChart(df, "category", "year")
plot(p)

(3) Education 商業類別

3.1 設定商店評論選擇條件

選擇在

  • 2011年以後每年都有評論
  • 2011年以後評論的總數數量大於100、小於300
  • 商業類別包含Education

的商店,將其bid放在japs裡面

japs = rev %>% 
  filter(bid %in% B$bid[X[,"Education"]]) %>% 
  filter(year >= 2011) %>% 
  group_by(bid) %>% 
  summarise(                      
    n_year = n_distinct(year),   # 
    n_rev = n()                  # 
    ) %>% 
  filter(
    n_year == 7,
    n_rev > 100 & n_rev < 300
    ) %>% 
  dplyr::select(bid) %>% 
  left_join(B)
Joining, by = "bid"
nrow(japs)
[1] 53
3.2 選出評論

選出這一些商店在2011年之後的所有評論,放在資料框R裡面

R = rev %>% filter( bid %in% japs$bid  & year >= 2011)

par(mar=c(3,4,3,2), cex=0.8)
R %>% count(bid) %>% .$n %>% hist(8, main="No. Reviews per Biz")


(4) 從平台取出評論內容:txt

#########################################################
library(dplyr) 
library(sparklyr)
Sys.setenv(SPARK_HOME="/usr/local/spark/spark-2.1.0-bin-hadoop2.7/")
config <- spark_config()
# config$spark.ui.port = "4044"
config$spark.executor.memory = "4G"
config$spark.driver.memory = "4G" 
config$spark.yarn.executor.memoryOverhead = "4096"
sc <- spark_connect(master = "spark://hnamenode:7077", config = config)

yelp = "hdfs://192.168.1.100:9000/home/tonychuo/yelp10/pq01/"
Text = spark_read_parquet(sc, "Text", paste0(yelp, "Text"))

txt = Text %>% filter(rid %in% R$rid ) %>% collect
R %>% head(20)
   cool       date funny stars useful   rid nchar  bid    uid anger anticipation disgust fear joy
1     0 2016-01-25     0     4      0 85121   508 3974 482077     1            3       1    0   1
2     0 2013-04-24     1     1      2 85122  1751 3974 671420     0            1       0    0   0
3     1 2011-08-26     1     4      2 85123  1437 3974 587413     1            0       1    0   1
4     0 2015-12-10     0     5      0 85124   673 3974 624335     0            1       0    0   5
5     0 2016-03-15     0     5      0 85125   439 3974 652688     1            3       0    1   4
6     0 2017-06-10     0     1      0 85126   180 3974 545739     1            7       1    1   4
7     0 2017-05-23     0     5      0 85127   327 3974  46160     1            3       1    1   4
8     0 2015-12-31     0     5      1 85128   280 3974 481641     0            1       0    0   0
9     0 2015-02-16     0     2      0 85129   809 3974 288898     3            3       3    6   5
10    0 2015-12-22     0     5      0 85130   100 3974 697318     2            5       4    2   4
11    0 2016-09-02     2     4      0 85131   224 3974  83488     2            5       2    4   3
12    0 2017-06-15     0     2      0 85132   769 3974 873791     1            5       0    2   3
13    0 2017-04-24     0     5      0 85133  1281 3974  53675     5            7       8    6   6
14    0 2015-03-30     0     2      0 85134   199 3974 291842     0            0       0    0   1
15    0 2015-02-19     1     1      1 85135   630 3974 482561     2            5       0    3   2
16    0 2014-06-26     0     5      0 85136   504 3974 314858     0            5       0    1   3
17    0 2017-04-11     3     1      2 85137   587 3974 375066     0            1       0    1   0
18    0 2012-02-29     0     1      1 85138   948 3974 676795     2            2       1    2   2
19    0 2012-07-29     0     2      0 85139  1132 3974  55741     3            5       2    1   3
20    0 2016-07-31     0     5      0 85140   321 3974 295252     5            8       3    7   8
   sadness surprise trust negative positive year
1        2        0     1        2        2 2016
2        0        0     0        2        0 2013
3        2        1     1        3        2 2011
4        0        2     4        1        6 2015
5        0        3     4        1        5 2016
6        1        2     4        2        7 2017
7        3        0     8        3       11 2017
8        1        0     1        1        3 2015
9        4        1     4        8        8 2015
10       3        2     5        7        6 2015
11       3        0     2        9        7 2016
12       1        1     3        4        5 2017
13       6        4     8       18       18 2017
14       0        0     0        0        2 2015
15       1        4     4        3        8 2015
16       1        1     4        4       10 2014
17       0        0     1        1        2 2017
18       0        2     2        6        5 2012
19       2        1     4        5        4 2012
20       3        3     9       10       12 2016
spark_disconnect(sc)
nrow(txt)
[1] 9346

總共有10,925篇評論

c(setequal(R$rid, txt$rid), identical(R$rid, txt$rid))
[1]  TRUE FALSE

從平台取出的評論次序和R$rid是不同的,所以需要排序一下

txt = left_join(R[,"rid",F], txt)
Joining, by = "rid"
identical(R$rid, txt$rid)
[1] TRUE
從主機取出話題評分
ypath = "/home/tonychuo/_S2018/yelp10/data/" 
load(paste0(ypath, "empath.rdata"))
empath = left_join(R[,"rid",F], empath)
Joining, by = "rid"
identical(R$rid, empath$rid)
[1] TRUE
gc()                                # release memory
            used  (Mb) gc trigger   (Mb)   max used   (Mb)
Ncells   2652381 141.7    4703850  251.3    3886542  207.6
Vcells 101539812 774.7  894015376 6820.8 1025360305 7822.9
par(mar=c(3,4,3,2), cex=0.8)
R$year %>% table %>% barplot(main="No. Review by Year")



(5) 聲量、星等、互動、情緒

5.1 商店間的動態比較
df = R %>% group_by(bid, year) %>% 
  mutate(engage = useful + cool + funny) %>% 
  summarise_at(
    c("engage", "stars", "positive", "negative"), 
    mean) %>% 
  left_join(count(R, bid, year)) %>% 
  data.frame
Joining, by = c("bid", "year")
p = gvisMotionChart(df, "bid", "year")
plot(p)

5.2 加入店名、去除極端值
df = left_join(df, B[,c("bid","name")])    # merge in biz name
Joining, by = "bid"
df = cbind(df[,-c(1:2)], df[,1:2,F]) %>%   # move bid to the last column
#  filter(! bid %in% c(6115, 3406, 30751)) %>%
  data.frame

df$name_bid=paste0(df$name,df$bid)
p = gvisMotionChart(df, "name_bid", "year") 
plot(p)
5.3 星等上升、下降最快的商店

使用線性回歸找出星等上升、下降最快的商店

library(stringr)
bx = sapply(split(df, df$bid), function(x) {
  lm(stars ~ year, x) %>% coef %>% `[`("year") }) %>%  # take lm coef.
  .[ abs(.) > 0.15  ] %>%                              # 
  names %>% str_extract("[0-9]*") %>% 
  as.integer

#把星等對年份做回歸,星等上升、下降最快代表迴歸係數
p = df %>% filter(bid %in% bx) %>% gvisMotionChart("name", "year")
plot(p)


(6) 商店與話題

6.1 商店話題矩陣與熱圖

Make biz-theme matrix - mx

library(d3heatmap)
empath %>% head
    rid help   office violence dance    money  wedding valuable domestic_work    sleep
1 85121    0 0.000000        0     0 0.028571 0.000000 0.000000      0.000000 0.000000
2 85122    0 0.000000        0     0 0.000000 0.000000 0.000000      0.000000 0.000000
3 85123    0 0.000000        0     0 0.000000 0.000000 0.000000      0.000000 0.000000
4 85124    0 0.008929        0     0 0.008929 0.000000 0.008929      0.008929 0.000000
5 85125    0 0.000000        0     0 0.000000 0.004425 0.000000      0.017699 0.000000
6 85126    0 0.015625        0     0 0.007813 0.015625 0.007813      0.000000 0.007813
  medical_emergency cold hate cheerfulness aggression occupation envy anticipation   family crime
1                 0    0    0     0.000000          0   0.000000    0            0 0.000000     0
2                 0    0    0     0.000000          0   0.000000    0            0 0.000000     0
3                 0    0    0     0.000000          0   0.000000    0            0 0.000000     0
4                 0    0    0     0.000000          0   0.000000    0            0 0.008929     0
5                 0    0    0     0.004425          0   0.000000    0            0 0.004425     0
6                 0    0    0     0.000000          0   0.007813    0            0 0.007813     0
  attractive masculine prison health pride dispute nervousness government weakness horror
1          0         0      0      0     0       0           0          0        0      0
2          0         0      0      0     0       0           0          0        0      0
3          0         0      0      0     0       0           0          0        0      0
4          0         0      0      0     0       0           0          0        0      0
5          0         0      0      0     0       0           0          0        0      0
6          0         0      0      0     0       0           0          0        0      0
  swearing_terms leisure suffering  royalty wealthy white_collar_job  tourism furniture school
1              0       0         0 0.000000       0                0 0.000000  0.000000      0
2              0       0         0 0.000000       0                0 0.000000  0.000000      0
3              0       0         0 0.000000       0                0 0.000000  0.000000      0
4              0       0         0 0.000000       0                0 0.000000  0.000000      0
5              0       0         0 0.004425       0                0 0.000000  0.000000      0
6              0       0         0 0.000000       0                0 0.007813  0.007813      0
  magic beach journalism  morning banking social_media exercise    night kill art play computer
1     0     0          0 0.014286       0     0.028571        0 0.000000    0   0    0 0.000000
2     0     0          0 0.000000       0     0.000000        0 0.000000    0   0    0 0.000000
3     0     0          0 0.000000       0     0.000000        0 0.000000    0   0    0 0.000000
4     0     0          0 0.017857       0     0.008929        0 0.000000    0   0    0 0.008929
5     0     0          0 0.000000       0     0.000000        0 0.004425    0   0    0 0.000000
6     0     0          0 0.007813       0     0.000000        0 0.000000    0   0    0 0.000000
  college traveling stealing real_estate    home divine sexual fear monster irritability superhero
1       0  0.000000        0      0.0000 0.00000      0      0    0       0            0         0
2       0  0.000000        0      0.0000 0.00000      0      0    0       0            0         0
3       0  0.000000        0      0.0000 0.03030      0      0    0       0            0         0
4       0  0.008929        0      0.0000 0.00000      0      0    0       0            0         0
5       0  0.008850        0      0.0177 0.02212      0      0    0       0            0         0
6       0  0.015625        0      0.0000 0.00000      0      0    0       0            0         0
  business  driving pet childish cooking exasperation religion hipster internet surprise reading
1 0.000000 0.000000   0 0.000000 0.02857            0        0       0 0.000000        0       0
2 0.000000 0.000000   0 0.000000 0.03226            0        0       0 0.000000        0       0
3 0.000000 0.000000   0 0.000000 0.03030            0        0       0 0.000000        0       0
4 0.000000 0.008929   0 0.000000 0.04464            0        0       0 0.008929        0       0
5 0.000000 0.000000   0 0.004425 0.02212            0        0       0 0.000000        0       0
6 0.007813 0.000000   0 0.007813 0.02344            0        0       0 0.000000        0       0
  worship leader independence movement body    noise  eating medieval zest confusion water sports
1       0      0            0        0    0 0.000000 0.02857        0    0   0.01429     0      0
2       0      0            0        0    0 0.000000 0.03226        0    0   0.00000     0      0
3       0      0            0        0    0 0.000000 0.03030        0    0   0.00000     0      0
4       0      0            0        0    0 0.017857 0.03571        0    0   0.00000     0      0
5       0      0            0        0    0 0.004425 0.05310        0    0   0.00000     0      0
6       0      0            0        0    0 0.000000 0.01562        0    0   0.00000     0      0
  death healing legend heroic celebration restaurant ridicule programming dominant_heirarchical
1     0       0      0      0    0.000000    0.02857        0    0.014286                     0
2     0       0      0      0    0.000000    0.03226        0    0.000000                     0
3     0       0      0      0    0.060606    0.03030        0    0.000000                     0
4     0       0      0      0    0.008929    0.01786        0    0.008929                     0
5     0       0      0      0    0.004425    0.02212        0    0.000000                     0
6     0       0      0      0    0.000000    0.03125        0    0.000000                     0
  military neglect swimming exotic    love hiking communication  hearing    order sympathy hygiene
1        0       0        0      0 0.00000      0      0.000000 0.000000 0.000000        0       0
2        0       0        0      0 0.00000      0      0.000000 0.000000 0.000000        0       0
3        0       0        0      0 0.03030      0      0.000000 0.000000 0.000000        0       0
4        0       0        0      0 0.00000      0      0.000000 0.017857 0.000000        0       0
5        0       0        0      0 0.00885      0      0.000000 0.004425 0.004425        0       0
6        0       0        0      0 0.00000      0      0.007813 0.000000 0.000000        0       0
  weather anonymity    trust ancient deception fabric air_travel fight dominant_personality music
1       0         0 0.014286       0         0      0   0.000000     0                    0     0
2       0         0 0.000000       0         0      0   0.000000     0                    0     0
3       0         0 0.000000       0         0      0   0.000000     0                    0     0
4       0         0 0.000000       0         0      0   0.000000     0                    0     0
5       0         0 0.000000       0         0      0   0.004425     0                    0     0
6       0         0 0.007813       0         0      0   0.007813     0                    0     0
  vehicle politeness toy farming meeting war speaking   listen urban shopping disgust fire     tool
1       0   0.000000   0       0       0   0 0.000000 0.000000     0 0.028571       0    0 0.000000
2       0   0.000000   0       0       0   0 0.000000 0.000000     0 0.000000       0    0 0.000000
3       0   0.000000   0       0       0   0 0.000000 0.000000     0 0.000000       0    0 0.000000
4       0   0.000000   0       0       0   0 0.008929 0.008929     0 0.008929       0    0 0.008929
5       0   0.000000   0       0       0   0 0.008850 0.004425     0 0.004425       0    0 0.000000
6       0   0.007813   0       0       0   0 0.007813 0.000000     0 0.015625       0    0 0.000000
  phone gain    sound injury sailing rage  science     work appearance optimism warmth youth
1     0    0 0.000000      0       0    0 0.000000 0.014286          0  0.00000      0     0
2     0    0 0.000000      0       0    0 0.000000 0.000000          0  0.00000      0     0
3     0    0 0.000000      0       0    0 0.000000 0.000000          0  0.00000      0     0
4     0    0 0.017857      0       0    0 0.008929 0.008929          0  0.00000      0     0
5     0    0 0.004425      0       0    0 0.000000 0.000000          0  0.01327      0     0
6     0    0 0.000000      0       0    0 0.000000 0.007813          0  0.00000      0     0
  sadness fun emotional joy affection fashion lust shame torment economics anger politics ship
1       0   0    0.0000   0   0.00000       0    0     0       0         0     0        0    0
2       0   0    0.0000   0   0.00000       0    0     0       0         0     0        0    0
3       0   0    0.0303   0   0.00000       0    0     0       0         0     0        0    0
4       0   0    0.0000   0   0.00000       0    0     0       0         0     0        0    0
5       0   0    0.0000   0   0.00885       0    0     0       0         0     0        0    0
6       0   0    0.0000   0   0.00000       0    0     0       0         0     0        0    0
  clothing car strength technology breaking shape_and_size power vacation  animal ugliness   party
1        0   0  0.00000   0.014286        0       0.000000     0  0.00000 0.00000        0 0.00000
2        0   0  0.00000   0.000000        0       0.000000     0  0.00000 0.03226        0 0.00000
3        0   0  0.00000   0.000000        0       0.000000     0  0.00000 0.00000        0 0.03030
4        0   0  0.00000   0.008929        0       0.008929     0  0.01786 0.00000        0 0.00000
5        0   0  0.00000   0.000000        0       0.000000     0  0.00885 0.01327        0 0.02212
6        0   0  0.01562   0.000000        0       0.007813     0  0.01562 0.00000        0 0.00000
  terrorism    smell blue_collar_job     poor plant pain  beauty timidity philosophy negotiate
1         0 0.000000        0.000000 0.000000     0    0 0.01429        0          0         0
2         0 0.000000        0.000000 0.000000     0    0 0.00000        0          0         0
3         0 0.000000        0.000000 0.000000     0    0 0.00000        0          0         0
4         0 0.008929        0.008929 0.000000     0    0 0.00000        0          0         0
5         0 0.004425        0.000000 0.000000     0    0 0.00000        0          0         0
6         0 0.000000        0.000000 0.007813     0    0 0.00000        0          0         0
  negative_emotion cleaning messaging competing law  friends  payment achievement  alcohol
1         0.000000        0  0.000000         0   0 0.014286 0.028571           0 0.000000
2         0.000000        0  0.000000         0   0 0.000000 0.000000           0 0.000000
3         0.030303        0  0.000000         0   0 0.000000 0.000000           0 0.000000
4         0.008929        0  0.008929         0   0 0.008929 0.000000           0 0.008929
5         0.000000        0  0.000000         0   0 0.013274 0.000000           0 0.004425
6         0.007813        0  0.000000         0   0 0.015625 0.007813           0 0.007813
  disappointment   liquid feminine weapon children    ocean   giving contentment writing rural
1              0 0.000000        0      0  0.00000 0.000000 0.014286           0       0     0
2              0 0.000000        0      0  0.00000 0.032258 0.000000           0       0     0
3              0 0.000000        0      0  0.00000 0.000000 0.030303           0       0     0
4              0 0.008929        0      0  0.00000 0.000000 0.000000           0       0     0
5              0 0.000000        0      0  0.00000 0.004425 0.004425           0       0     0
6              0 0.000000        0      0  0.01562 0.000000 0.007813           0       0     0
  positive_emotion musical
1          0.00000       0
2          0.00000       0
3          0.00000       0
4          0.00000       0
5          0.01327       0
6          0.00000       0
mx = sapply(split(empath[,-1], R$bid), colMeans) %>% t   # biz-theme matrix
bnames = B$name[match(rownames(mx), B$bid)]
rownames(mx) = bnames

Select themes with medium-high weights 取出權重值在60%~95%之間的Theme

rx = colSums(mx) %>% quantile(c(0.60, 0.95))          #     
mx = mx[, colSums(mx) > rx[1] & colSums(mx) < rx[2]]  #

Check the range of weights

par(cex=0.8)
hist(log(mx+1e-4))

6.2 D3heatmap
mx %>% {log(.+1e-4)} %>% t %>% d3heatmap(color=cm.colors(9))
6.3 熱圖 + 互動式集群分析

這個工具不能投射在網頁上,只能直接在Rstudio裡面做

library(morpheus)
mx %>% {log(.+1e-4)} %>% morpheus


(7) 商店的情緒分佈

將商店投射到尺度縮減之後的情緒平面上


7.1 建立商店情緒矩陣
pcx = sapply(split(R[,10:19], R$bid), colMeans) %>% t
rownames(pcx) = bnames
7.2 尺度縮減
library(FactoMineR)
library(factoextra)
pcx = pcx %>% scale %>% PCA(ncp=10, graph=F) 
par(cex=0.8)
barplot(pcx$eig[1:10,3],names=1:10,main="Accumulated Variance",
        xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')

前三個主成分已經涵蓋了90%的變異

7.3 情緒平面上的商店
source("bipcx.R")

N = n_distinct(R$bid)
bipcx(pcx,1,2,10,N,t1="Strength",t2="Valence",
      obs='Business', main="Strength & Valence of Sentiment")
bipcx(pcx,3,2,10,N,t1="Arosual",t2="Valence",
      obs='Business', main="Strength & Valence of Sentiment")



(8) 文字分析與字雲

8.1 建立字頻表 (文件字詞矩陣)
library(tm)
dtm = txt$text %>% 
  iconv(to = "utf-8", sub="") %>% 
  VectorSource %>% Corpus %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(removePunctuation) %>% 
  tm_map(removeWords, stopwords("english")) %>% 
  tm_map(stemDocument) %>% 
  DocumentTermMatrix %>% 
  removeSparseTerms(0.998)
dtm  # (documents: 10925, terms: 2520)
<<DocumentTermMatrix (documents: 9346, terms: 2536)>>
Non-/sparse entries: 387348/23314108
Sparsity           : 98%
Maximal term length: 13
Weighting          : term frequency (tf)
8.2 使用TF-IDF篩選字詞
library(slam)
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
  log2(nrow(dtm)/col_sums(dtm > 0))
summary(tfidf)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0482  0.1155  0.1433  0.1532  0.1777  0.9287 
dtm = dtm[, tfidf > 0.1471 ]
dtm = dtm[,order(-col_sums(dtm))]
dim(dtm)
[1] 9346 1176
8.3 使用tSNE做尺度縮減
library(Rtsne)
n = 800
tsne = dtm[, 1:n] %>% as.data.frame.matrix %>% 
  scale %>% t %>% Rtsne(
    check_duplicates = FALSE, theta=0.0, max_iter=3200)
8.4 層級式集群分析
Y = tsne$Y              # tSNE coordinates
d = dist(Y)             # distance matrix
hc = hclust(d)          # hi-clustering
K = 100                 # number of clusters 
g = cutree(hc,K)        # cut into K clusters
table(g) %>% as.vector %>% sort         # sizes of clusters
  [1]  2  2  2  3  3  3  3  3  4  4  4  4  4  4  4  4  4  4  5  5  5  5  5  5  5  5  6  6  6  6  6
 [32]  6  6  6  6  6  6  6  6  6  7  7  7  7  7  7  7  7  8  8  8  8  8  8  8  8  8  8  8  8  8  8
 [63]  8  9  9  9  9  9  9  9  9  9 10 10 10 10 10 10 11 11 11 11 11 11 11 11 12 12 13 13 13 14 14
 [94] 14 15 16 16 18 18 22
8.5 文字雲
library(randomcoloR)
library(wordcloud)

wc = col_sums(dtm[,1:n])
colors = distinctColorPalette(K)

png("fig/pet.png", width=3200, height=1800)
textplot(
  Y[,1], Y[,2], colnames(dtm)[1:n], show=F, 
  col=colors[g],
  cex= 0.2 + 1.25 * sqrt(wc/mean(wc)),
  font=2)
dev.off()
png 
  2 

(9) 商務屬性 Business Attributes

9.1 bizatt 資料框
summary(bizatt)
      bid         AcceptsInsurance  AgesAllowed              Alcohol          BYOB        
 Min.   :     1   Mode :logical    18plus :    86   beer_and_wine:  6444   Mode :logical  
 1st Qu.: 39150   FALSE:2149       19plus :    63   full_bar     : 18767   FALSE:857      
 Median : 78310   TRUE :6426       21plus :   211   none         : 19022   TRUE :48       
 Mean   : 78311   NA's :147684     allages:    35   NA's         :112026   NA's :155354   
 3rd Qu.:117462                    NA's   :155864                                         
 Max.   :156639                                                                           
                                                                                          
      BYOBCorkage     BikeParking     BusinessAcceptsBitcoin BusinessAcceptsCreditCards
 no         :   752   Mode :logical   Mode :logical          Mode :logical             
 yes_corkage:   157   FALSE:16419     FALSE:8663             FALSE:7579                
 yes_free   :   491   TRUE :57064     TRUE :211              TRUE :113145              
 NA's       :154859   NA's :82776     NA's :147385           NA's :35535               
                                                                                       
                                                                                       
                                                                                       
 ByAppointmentOnly   Caters        CoatCheck        Corkage        DogsAllowed     DriveThru      
 Mode :logical     Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:18059       FALSE:17221     FALSE:6887      FALSE:512       FALSE:7642      FALSE:3686     
 TRUE :16714       TRUE :17306     TRUE :1089      TRUE :140       TRUE :3568      TRUE :2352     
 NA's :121486      NA's :121732    NA's :148283    NA's :155607    NA's :145049    NA's :150221   
                                                                                                  
                                                                                                  
                                                                                                  
 GoodForDancing  GoodForKids     HappyHour         HasTV             NoiseLevel     Open24Hours    
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   average  : 26119   Mode :logical  
 FALSE:6470      FALSE:11114     FALSE:2478      FALSE:21116     loud     :  3908   FALSE:318      
 TRUE :2062      TRUE :47232     TRUE :5937      TRUE :22601     quiet    :  8947   TRUE :29       
 NA's :147727    NA's :97913     NA's :147844    NA's :112542    very_loud:  1670   NA's :155912   
                                                                 NA's     :115615                  
                                                                                                   
                                                                                                   
 OutdoorSeating  RestaurantsAttire RestaurantsCounterService RestaurantsDelivery
 Mode :logical   casual: 43277     Mode :logical             Mode :logical      
 FALSE:29680     dressy:  1397     FALSE:150                 FALSE:35526        
 TRUE :20619     formal:   125     TRUE :246                 TRUE :11264        
 NA's :105960    NA's  :111460     NA's :155863              NA's :109469       
                                                                                
                                                                                
                                                                                
 RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsReservations RestaurantsTableService
 Mode :logical            Min.   :1              Mode :logical           Mode :logical          
 FALSE:6223               1st Qu.:1              FALSE:28014             FALSE:14318            
 TRUE :43391              Median :2              TRUE :18496             TRUE :25636            
 NA's :106645             Mean   :2              NA's :109749            NA's :116305           
                          3rd Qu.:2                                                             
                          Max.   :4                                                             
                          NA's   :59929                                                         
 RestaurantsTakeOut    Smoking       WheelchairAccessible   WiFi          casual       
 Mode :logical      no     :  3019   Mode :logical        free: 20924   Mode :logical  
 FALSE:5355         outdoor:  3513   FALSE:5815           no  : 21954   FALSE:23947    
 TRUE :49277        yes    :  1140   TRUE :38202          paid:   543   TRUE :18942    
 NA's :101627       NA's   :148587   NA's :112242         NA's:112838   NA's :113370   
                                                                                       
                                                                                       
                                                                                       
   classy          divey          hipster         intimate        romantic        touristy      
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:41968     FALSE:26060     FALSE:41903     FALSE:42299     FALSE:42367     FALSE:42658    
 TRUE :921       TRUE :1111      TRUE :963       TRUE :590       TRUE :522       TRUE :231      
 NA's :113370    NA's :129088    NA's :113393    NA's :113370    NA's :113370    NA's :113370   
                                                                                                
                                                                                                
                                                                                                
   trendy         upscale          friday          monday         saturday         sunday       
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:40921     FALSE:42492     FALSE:1857      FALSE:5698      FALSE:1726      FALSE:5113     
 TRUE :1968      TRUE :397       TRUE :4534      TRUE :693       TRUE :4665      TRUE :1278     
 NA's :113370    NA's :113370    NA's :149868    NA's :149868    NA's :149868    NA's :149868   
                                                                                                
                                                                                                
                                                                                                
  thursday        tuesday        wednesday         garage           lot            street       
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:3841      FALSE:5738      FALSE:5247      FALSE:84470     FALSE:54451     FALSE:72745    
 TRUE :2550      TRUE :653       TRUE :1144      TRUE :4950      TRUE :34969     TRUE :16675    
 NA's :149868    NA's :149868    NA's :149868    NA's :66839     NA's :66839     NA's :66839    
                                                                                                
                                                                                                
                                                                                                
   valet         validated       dairy-free      gluten-free       halal           kosher       
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:87761     FALSE:86762     FALSE:231       FALSE:225       FALSE:243       FALSE:245      
 TRUE :1659      TRUE :440       TRUE :18        TRUE :24        TRUE :6         TRUE :4        
 NA's :66839     NA's :69057     NA's :156010    NA's :156010    NA's :156010    NA's :156010   
                                                                                                
                                                                                                
                                                                                                
  soy-free         vegan         vegetarian      breakfast         brunch         dessert       
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:243       FALSE:128       FALSE:177       FALSE:39214     FALSE:39388     FALSE:41418    
 TRUE :6         TRUE :121       TRUE :72        TRUE :3726      TRUE :3552      TRUE :1522     
 NA's :156010    NA's :156010    NA's :156010    NA's :113319    NA's :113319    NA's :113319   
                                                                                                
                                                                                                
                                                                                                
   dinner        latenight         lunch         africanamerican   asian          coloring      
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:27638     FALSE:40462     FALSE:25109     FALSE:751       FALSE:598       FALSE:279      
 TRUE :15302     TRUE :2478      TRUE :17831     TRUE :423       TRUE :576       TRUE :1126     
 NA's :113319    NA's :113319    NA's :113319    NA's :155085    NA's :155085    NA's :154854   
                                                                                                
                                                                                                
                                                                                                
   curly         extensions         kids           perms         straightperms   background_music
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   
 FALSE:399       FALSE:570       FALSE:693       FALSE:745       FALSE:672       FALSE:6193      
 TRUE :1006      TRUE :835       TRUE :712       TRUE :660       TRUE :502       TRUE :1833      
 NA's :154854    NA's :154854    NA's :154854    NA's :154854    NA's :155085    NA's :148233    
                                                                                                 
                                                                                                 
                                                                                                 
     dj           jukebox         karaoke           live          no_music         video        
 Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical   Mode :logical  
 FALSE:6312      FALSE:6895      FALSE:7785      FALSE:6621      FALSE:8026      FALSE:7819     
 TRUE :1714      TRUE :1131      TRUE :241       TRUE :1405      NA's :148233    TRUE :207      
 NA's :148233    NA's :148233    NA's :148233    NA's :148233                    NA's :148233   
                                                                                                
                                                                                                
                                                                                                
bizatt$RestaurantsPriceRange2[ X[,"Restaurants"] ] %>% table(useNA="ifany")
.
    1     2     3     4  <NA> 
19013 24917  2845   537  4299 
grep("mex|ita",colnames(X),T, value=T)
 [1] "Italian"                "Mexican"                "Tex-Mex"               
 [4] "Hospitals"              "Vitamins & Supplements" "Rehabilitation Center" 
 [7] "Guitar Stores"          "Meditation Centers"     "Emergency Pet Hospital"
[10] "New Mexican Cuisine"   
italian = X[,"Italian"] & X[,"Restaurants"]
mexican = X[,"Mexican"] & X[,"Restaurants"]
c(sum(italian), sum(mexican), sum(italian & mexican))
[1] 4410 3913   25
bizatt$RestaurantsPriceRange2[ italian ] %>% table(useNA="ifany")
.
   1    2    3    4 <NA> 
 788 2698  519   70  335 
bizatt$RestaurantsPriceRange2[ mexican ] %>% table(useNA="ifany")
.
   1    2    3    4 <NA> 
2286 1388   41    5  193