# install.packages("googleVis")
# install.packages("Rtsne")
# install.packages("d3heatmap")
# install.packages("wordcloud")rm(list=ls(all=T)); gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 495661 26.5 940480 50.3 940480 50.3
## Vcells 937659 7.2 1650153 12.6 1158393 8.9
library(knitr); opts_chunk$set(comment = NA)
library(Matrix)
library(dplyr)
library(googleVis)
library(doParallel)
library(Rtsne)
library(wordcloud)
library(RColorBrewer)
library(randomcoloR)
library(d3heatmap)
library(morpheus)
library(FactoMineR)
library(factoextra)
library(highcharter)
library(tm)
library(slam)
library(MASS)
# options(width=100, gvis.plot.tag='chart', digits=4, scipen=60)
# options(width=100, digits=4, scipen=60)
# Read in the data already pre-processed by YelpExplore
prep2 = "prep2/"
load(paste0(prep2,"businesses.rdata"))
load(paste0(prep2,"reviews.rdata"))rev# 使用cbind()將評論摘要與情緒評分整併
rev = cbind(rev, senti[,-1])
range(rev$date)
## [1] "2004-07-22" "2017-07-26"
# Use late July as the cut point and assign each review to the yearly bucket it falls in
# rev$year = as.integer(format(rev$date, "%Y"))
breaks = as.Date(c("2004-07-22", paste0(2005:2017, "-07-27")))
rev$year = as.integer(cut(rev$date, breaks)) + 2004
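As a quick illustrative check of the bucketing (this snippet is not part of the original script): with left-closed intervals, a review dated just before the July cut point stays in its calendar year, while one dated just after rolls into the next bucket.
as.integer(cut(as.Date(c("2010-07-26", "2010-07-28")), breaks)) + 2004   # should give 2010 2011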
# Count reviews per business (bid), sort in descending order, and keep the bids of the top-100 businesses
b100 = rev %>% count(bid) %>% arrange(desc(n)) %>% head(100) %>% .$bid
# Summarise yearly sentiment for these top-100 businesses
df = rev %>% filter(bid %in% b100) %>%
  # keep only reviews belonging to the top-100 businesses
  group_by(bid, year) %>%
  # group by business and year
  summarise(
    # summarise each group: average the positive, negative, and sadness scores
    positive = mean(positive),
    negative = mean(negative),
    sad = mean(sadness),
    n = n()
  )
# Filter the summaries from 2009 onward, count each business's yearly records, and keep the bids with all 9 years
y09 = df %>% filter(year >= 2009) %>% count(bid) %>% filter(nn == 9) %>% .$bid
# Use a Google Motion Chart to see how users rated these businesses between 2010 and 2016
df %>% filter(year >= 2010 & year <= 2016 & bid %in% y09) %>%
data.frame %>%
gvisMotionChart("bid", "year") %>% plotstarting httpd help server ... done
# Load the business-category data (822 categories)
load(paste0(prep2, "cat822.rdata"))
df = cat822 %>%
  filter(year >= 2010 & year <= 2016) %>%
  # keep the years 2010 through 2016
  filter(category %in% colnames(X)[1:100]) %>%
  # keep only the top-100 categories
  dplyr::select(category, year, negative, stars, useful, n,
                # select category, year, and the sentiment/engagement columns
                positive, joy, anticipation, trust, surprise,
                sadness, anger, disgust, fear, cool, funny
  ) %>%
  group_by(year) %>% # scale scores by years so they become...
  mutate(n = 0.1 + n - min(n) ) %>% # avoid negative bubble size
  data.frame
p = gvisMotionChart(df, "category", "year")
plot(p)
As the bubble chart shows, with review volume and star rating on the axes, every category drifts toward the upper right, and the largest bubbles are clearly Restaurants and Food. This tells us that Yelp users care a great deal about dining and that food-related categories and businesses dominate the platform. Combined with the current enthusiasm for healthy eating and fitness, it also points to changing dining habits, so we chose the light-dining categories as the main target of our analysis.
We take the "Instagram crowd" (網美族) as our target audience.
We select the businesses whose bid falls under the Cafes, Breakfast & Brunch, or Desserts categories.
# A business counts if it belongs to at least one of Cafes, Breakfast & Brunch, or Desserts
buty = rowSums(X[,c("Cafes", "Breakfast & Brunch", "Desserts")]) > 0
sum(buty)
## [1] 8386
cf = rev %>%
  filter(bid %in% B$bid[buty]) %>%
  # keep only reviews belonging to these businesses
  filter(year >= 2011) %>%
  # reviews from 2011 onward
  group_by(bid) %>%
  summarise(
    n_year = n_distinct(year), # number of distinct years
    n_rev = n()                # number of reviews
  ) %>%
  filter(
    n_year == 7,               # has data in all seven years
    n_rev > 100 & n_rev < 300  # between 100 and 300 reviews
  ) %>%
  dplyr::select(bid) %>%
  left_join(B) # merge in the basic business data
## Joining, by = "bid"
nrow(cf)
## [1] 206
Put all the reviews of these businesses from 2011 onward into the data frame R.
R = rev %>% filter(bid %in% cf$bid & year >= 2011)
# reviews of these businesses from 2011 onward
par(mar=c(3,4,3,2), cex=0.8)
R %>% count(bid) %>% .$n %>% hist(8, main="No. Reviews per Biz") # histogram of reviews per business
# Pull the review texts (stored on HDFS for fast access via Spark)
library(dplyr)
library(sparklyr)
Sys.setenv(SPARK_HOME="/usr/local/spark/spark-2.1.0-bin-hadoop2.7/")
config <- spark_config()
config$spark.ui.port = "4044"
config$spark.executor.memory = "4G"
config$spark.driver.memory = "4G"
config$spark.yarn.executor.memoryOverhead = "4096"
sc <- spark_connect(master = "spark://hnamenode:7077", config = config)
yelp = "hdfs://192.168.1.100:9000/home/tonychuo/yelp10/pq01/"
Text = spark_read_parquet(sc, "Text", paste0(yelp, "Text"))
txt = Text %>% filter(rid %in% R$rid) %>% collect
spark_disconnect(sc)
Count the total number of reviews.
nrow(txt)
## [1] 35003
The reviews fetched from the platform are not in the same order as R$rid, so they need to be re-aligned.
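A toy illustration of the re-alignment trick used below (the data frames a and b here are made up): left-joining onto a one-column key table returns the rows of the right table in the key order of the left table.
a = data.frame(rid = c(3, 1, 2))
b = data.frame(rid = 1:3, text = c("x", "y", "z"))
left_join(a, b, by = "rid")   # rows come back in the order 3, 1, 2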
c(setequal(R$rid, txt$rid), identical(R$rid, txt$rid))
## [1]  TRUE FALSE
txt = left_join(R[,"rid",F], txt)
## Joining, by = "rid"
identical(R$rid, txt$rid)
## [1] TRUE
ypath = "./prep2/"
load(paste0(ypath, "empath.rdata"))
empath = left_join(R[,"rid",F], empath)Joining, by = "rid"
identical(R$rid, empath$rid)[1] TRUE
gc() # release memory used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2678675 143.1 4703850 251.3 3886542 207.6
Vcells 111448601 850.3 830526033 6336.5 1035268734 7898.5
table(rev$year) %>% barplot
par(mar=c(3,4,3,2), cex=0.8)
R$year %>% table %>% barplot(main="No. Review by Year") # bar chart of the number of reviews per year
df = R %>% group_by(bid, year) %>%
  mutate(engage = useful + cool + funny) %>%
  # add an engagement score: the sum of useful, cool, and funny votes
  summarise_at(
    c("engage", "stars", "positive", "negative"),
    # average these four columns
    mean) %>%
  left_join(count(R, bid, year)) %>% # merge in the yearly review counts
  data.frame
## Joining, by = c("bid", "year")
p = gvisMotionChart(df, "bid", "year")
# Motion chart of each business's review volume, stars, engagement, and sentiment
plot(p)
df = left_join(df, B[,c("bid","name")]) # merge in biz name
## Joining, by = "bid"
df = cbind(df[,-c(1:2)], df[,1:2,F]) %>% # move bid and year to the last columns
  filter(! bid %in% c(4606, 114755, 121795, 25483, 19411, 58860, 84942, 77112, 58467, 154230, 130544, 111963, 97552, 154526)) %>%
  # drop Starbucks, Tim Hortons, and other duplicated bids
  data.frame
p = gvisMotionChart(df, "name", "year") # 以動態呈現聲量、星等、互動、情緒表現
plot(p)使用線性回歸找出星等上升、下降最快的商店
library(stringr)
bx = sapply(split(df, df$bid), function(x) {
lm(stars ~ year, x) %>% coef %>% `[`("year") }) %>% # take lm coef.
  .[ abs(.) > 0.15 ] %>% # keep only slopes with absolute value above 0.15
  names %>% str_extract("[0-9]*") %>%
  as.integer
p = df %>% filter(bid %in% bx) %>% gvisMotionChart("name", "year")
# Motion chart of the businesses with the most extreme rating trends
plot(p)
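To see which businesses end up in this extreme group, an illustrative check (not part of the original script) is:
df %>% filter(bid %in% bx) %>% distinct(bid, name)   # the conclusions below single out Copper Whisk Café from this list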
Make biz-theme matrix - mx
# biz-theme matrix: average empath theme score per business
library(d3heatmap)
mx = sapply(split(empath[,-1], R$bid), colMeans) %>% t
bnames = B$name[match(rownames(mx), B$bid)]
rownames(mx) = bnames
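A toy illustration of the split()/colMeans() pattern used to build mx above (the data frame d and the grouping vector grp are made up): each group becomes one row of averaged theme scores.
d   = data.frame(joy = c(1, 3, 2, 4), fear = c(0, 1, 1, 3))
grp = c("a", "a", "b", "b")
sapply(split(d, grp), colMeans) %>% t   # one row per group, one column per theme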
Select themes with medium-high weights
# keep the themes whose column sums fall between the 60th and 95th percentiles
rx = colSums(mx) %>% quantile(c(0.60, 0.95))
mx = mx[, colSums(mx) > rx[1] & colSums(mx) < rx[2]]
Check the range of weights
# look at the distribution of the weights (histogram)
par(cex=0.8)
hist(log(mx+1e-4))
# heatmap of business themes
mx %>% {log(.+1e-4)} %>% t %>% d3heatmap(color=rev(cm.colors(9)))
# physical-store analysis
# topics = c("attractive","beauty","affection","leisure","feminine","cleaning","healing","hygiene","tra#veling","fun","vacation")
# tmx = sapply(topics, function (k) {
# rownames(mx)[ order(-mx[,k]) ] %>% head(10)
# } )
# write.csv(tmx,file="tmx.csv")
# table(tmx) %>% sort
This tool cannot be embedded in a web page; it only runs directly inside RStudio.
library(morpheus)
mx %>% {log(.+1e-4)} %>% morpheus
Project the businesses onto the dimension-reduced sentiment plane.
pcx = sapply(split(R[,10:19], R$bid), colMeans) %>% t
rownames(pcx) = bnames
library(FactoMineR)
library(factoextra)
pcx = pcx %>% scale %>% PCA(ncp=10, graph=F)
The first three principal components already cover 90% of the variance.
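A quick numeric check of that claim, as an illustrative sketch (pcx$eig is the eigenvalue table returned by FactoMineR::PCA; its third column is the cumulative percentage of variance):
pcx$eig[1:3, ]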
# bar chart of the accumulated variance
par(cex=0.8)
barplot(pcx$eig[1:10,3],names=1:10,main="Accumulated Variance",
xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')
# Sentiment distribution of all businesses across the three categories
source("bipcx.R")
N = n_distinct(R$bid)
bipcx(pcx,1,2,10,N,t1="Strength",t2="Valence",
obs='Business', main="Strength & Valence of Sentiment")bipcx(pcx,3,2,10,N,t1="Arosual",t2="Valence",
obs='Business', main="Strength & Valence of Sentiment")library(tm)
dtm = txt$text %>%
iconv(to = "utf-8", sub="") %>%
VectorSource %>% Corpus %>%
tm_map(content_transformer(tolower)) %>% # convert to lower case
tm_map(removePunctuation) %>% # remove punctuation
tm_map(removeWords, stopwords("english")) %>% # remove stop words
tm_map(stemDocument) %>% # stem the words
DocumentTermMatrix %>% # build the document-term matrix
removeSparseTerms(0.998) # drop very sparse terms (those appearing in fewer than about 0.2% of the reviews)
dtm # (documents: 10925, terms: 2520)
Filter terms by TF-IDF.
library(slam)
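As a rough worked example of the weighting computed below (illustrative numbers, not from the data): a term that on average makes up 1% of the words in the reviews containing it, and that appears in one eighth of all reviews, gets a tf-idf score of about 0.01 * log2(8) = 0.03, well under the 0.148 cut-off used afterwards.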
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) * # mean relative frequency of each term
  log2(nrow(dtm)/col_sums(dtm > 0))                       # times log2(N / document frequency)
summary(tfidf)
dtm = dtm[, tfidf > 0.14811 ] # keep only terms above the tf-idf threshold
dtm = dtm[,order(-col_sums(dtm))]
dim(dtm)
Dimension reduction with t-SNE.
library(Rtsne)
n = 800
tsne = dtm[, 1:n] %>% as.data.frame.matrix %>%
scale %>% t %>% Rtsne(
check_duplicates = FALSE, theta=0.0, max_iter=3200) # t-SNE on the transposed matrix, i.e. on the terms
Y = tsne$Y # tSNE coordinates
d = dist(Y) # distance matrix
hc = hclust(d) # hi-clustering
K = 100 # number of clusters
g = cutree(hc,K) # cut into K clusters
table(g) %>% as.vector %>% sort # sizes of clusters
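To eyeball what landed in each cluster, one illustrative check (not part of the original script) is to list the term names by cluster id:
split(colnames(dtm)[1:n], g)[1:3]   # terms in the first three clusters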
library(randomcoloR)
library(wordcloud)
wc = col_sums(dtm[,1:n])
colors = distinctColorPalette(K)
fsize = log(wc)/5 # control the font size
range(fsize)
png("fig/cafe.png", width=3200, height=1800) # write to a PNG file
textplot(
Y[,1], Y[,2], colnames(dtm)[1:n], show=F,
col=colors[g],
cex= fsize,
font=2)
dev.off()
Word-cloud figure (saved as fig/cafe.png).
The bizatt data frame
summary(bizatt)
bid AcceptsInsurance AgesAllowed Alcohol
Min. : 1 Mode :logical 18plus : 86 beer_and_wine: 6444
1st Qu.: 39150 FALSE:2149 19plus : 63 full_bar : 18767
Median : 78310 TRUE :6426 21plus : 211 none : 19022
Mean : 78311 NA's :147684 allages: 35 NA's :112026
3rd Qu.:117462 NA's :155864
Max. :156639
BYOB BYOBCorkage BikeParking
Mode :logical no : 752 Mode :logical
FALSE:857 yes_corkage: 157 FALSE:16419
TRUE :48 yes_free : 491 TRUE :57064
NA's :155354 NA's :154859 NA's :82776
BusinessAcceptsBitcoin BusinessAcceptsCreditCards ByAppointmentOnly
Mode :logical Mode :logical Mode :logical
FALSE:8663 FALSE:7579 FALSE:18059
TRUE :211 TRUE :113145 TRUE :16714
NA's :147385 NA's :35535 NA's :121486
Caters CoatCheck Corkage DogsAllowed
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:17221 FALSE:6887 FALSE:512 FALSE:7642
TRUE :17306 TRUE :1089 TRUE :140 TRUE :3568
NA's :121732 NA's :148283 NA's :155607 NA's :145049
DriveThru GoodForDancing GoodForKids HappyHour
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:3686 FALSE:6470 FALSE:11114 FALSE:2478
TRUE :2352 TRUE :2062 TRUE :47232 TRUE :5937
NA's :150221 NA's :147727 NA's :97913 NA's :147844
HasTV NoiseLevel Open24Hours OutdoorSeating
Mode :logical average : 26119 Mode :logical Mode :logical
FALSE:21116 loud : 3908 FALSE:318 FALSE:29680
TRUE :22601 quiet : 8947 TRUE :29 TRUE :20619
NA's :112542 very_loud: 1670 NA's :155912 NA's :105960
NA's :115615
RestaurantsAttire RestaurantsCounterService RestaurantsDelivery
casual: 43277 Mode :logical Mode :logical
dressy: 1397 FALSE:150 FALSE:35526
formal: 125 TRUE :246 TRUE :11264
NA's :111460 NA's :155863 NA's :109469
RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsReservations
Mode :logical Min. :1.00 Mode :logical
FALSE:6223 1st Qu.:1.00 FALSE:28014
TRUE :43391 Median :2.00 TRUE :18496
NA's :106645 Mean :1.82 NA's :109749
3rd Qu.:2.00
Max. :4.00
NA's :59929
RestaurantsTableService RestaurantsTakeOut Smoking
Mode :logical Mode :logical no : 3019
FALSE:14318 FALSE:5355 outdoor: 3513
TRUE :25636 TRUE :49277 yes : 1140
NA's :116305 NA's :101627 NA's :148587
WheelchairAccessible WiFi casual classy
Mode :logical free: 20924 Mode :logical Mode :logical
FALSE:5815 no : 21954 FALSE:23947 FALSE:41968
TRUE :38202 paid: 543 TRUE :18942 TRUE :921
NA's :112242 NA's:112838 NA's :113370 NA's :113370
divey hipster intimate romantic
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:26060 FALSE:41903 FALSE:42299 FALSE:42367
TRUE :1111 TRUE :963 TRUE :590 TRUE :522
NA's :129088 NA's :113393 NA's :113370 NA's :113370
touristy trendy upscale friday
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:42658 FALSE:40921 FALSE:42492 FALSE:1857
TRUE :231 TRUE :1968 TRUE :397 TRUE :4534
NA's :113370 NA's :113370 NA's :113370 NA's :149868
monday saturday sunday thursday
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:5698 FALSE:1726 FALSE:5113 FALSE:3841
TRUE :693 TRUE :4665 TRUE :1278 TRUE :2550
NA's :149868 NA's :149868 NA's :149868 NA's :149868
tuesday wednesday garage lot
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:5738 FALSE:5247 FALSE:84470 FALSE:54451
TRUE :653 TRUE :1144 TRUE :4950 TRUE :34969
NA's :149868 NA's :149868 NA's :66839 NA's :66839
street valet validated dairy-free
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:72745 FALSE:87761 FALSE:86762 FALSE:231
TRUE :16675 TRUE :1659 TRUE :440 TRUE :18
NA's :66839 NA's :66839 NA's :69057 NA's :156010
gluten-free halal kosher soy-free
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:225 FALSE:243 FALSE:245 FALSE:243
TRUE :24 TRUE :6 TRUE :4 TRUE :6
NA's :156010 NA's :156010 NA's :156010 NA's :156010
vegan vegetarian breakfast brunch
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:128 FALSE:177 FALSE:39214 FALSE:39388
TRUE :121 TRUE :72 TRUE :3726 TRUE :3552
NA's :156010 NA's :156010 NA's :113319 NA's :113319
dessert dinner latenight lunch
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:41418 FALSE:27638 FALSE:40462 FALSE:25109
TRUE :1522 TRUE :15302 TRUE :2478 TRUE :17831
NA's :113319 NA's :113319 NA's :113319 NA's :113319
africanamerican asian coloring curly
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:751 FALSE:598 FALSE:279 FALSE:399
TRUE :423 TRUE :576 TRUE :1126 TRUE :1006
NA's :155085 NA's :155085 NA's :154854 NA's :154854
extensions kids perms straightperms
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:570 FALSE:693 FALSE:745 FALSE:672
TRUE :835 TRUE :712 TRUE :660 TRUE :502
NA's :154854 NA's :154854 NA's :154854 NA's :155085
background_music dj jukebox karaoke
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:6193 FALSE:6312 FALSE:6895 FALSE:7785
TRUE :1833 TRUE :1714 TRUE :1131 TRUE :241
NA's :148233 NA's :148233 NA's :148233 NA's :148233
live no_music video
Mode :logical Mode :logical Mode :logical
FALSE:6621 FALSE:8026 FALSE:7819
TRUE :1405 NA's :148233 TRUE :207
NA's :148233 NA's :148233
bizatt$RestaurantsPriceRange2[ X[,"Restaurants"] ] %>% table(useNA="ifany")
## .
##     1     2     3     4  <NA>
## 19013 24917  2845   537  4299
grep("mex|ita",colnames(X),T, value=T) [1] "Italian" "Mexican"
[3] "Tex-Mex" "Hospitals"
[5] "Vitamins & Supplements" "Rehabilitation Center"
[7] "Guitar Stores" "Meditation Centers"
[9] "Emergency Pet Hospital" "New Mexican Cuisine"
italian = X[,"Italian"] & X[,"Restaurants"]
mexican = X[,"Mexican"] & X[,"Restaurants"]
c(sum(italian), sum(mexican), sum(italian & mexican))
## [1] 4410 3913   25
bizatt$RestaurantsPriceRange2[ italian ] %>% table(useNA="ifany")
## .
##    1    2    3    4 <NA>
##  788 2698  519   70  335
bizatt$RestaurantsPriceRange2[ mexican ] %>% table(useNA="ifany")
## .
##    1    2    3    4 <NA>
## 2286 1388   41    5  193
Notes on the analysis workflow:
Steps (1), (2), and (3) turn out to be the most critical part of the decision process. We did try to frame the question systematically and logically (from the trends in the bubble chart), but pure curiosity and past experience still dominated the choice of question, so subjective judgment (the shift in dining habits) ended up shaping the categories we chose.
The categories we chose (cafes, breakfast & brunch, desserts) all belong to light dining, so we compare them together. It is also worth noting from (4) that there are fewer reviews in 2017 than in 2016. Review counts had grown year after year, so why did 2017 drop (the housing market? the economy?)? This, too, is something we try to describe through the data at scale.
Next we analyze the businesses through visualizations of review volume, stars, engagement, and sentiment; steps (6) and (7) dig deeper, aiming to peel back the root cause and diagnose from the charts why things happened. For example, after selecting the businesses whose star ratings rose or fell the fastest, we found that the stars and review counts of Copper Whisk Café dropped sharply in 2016; this kind of analysis points us to the data we should search and the possible causes to investigate, perhaps a change of ownership.
From this analysis we learn that reviews of cafes, breakfast & brunch, and dessert shops keep increasing (except in 2017). Among the many businesses, we can also identify the top performers as benchmarks, analyze what makes them excel, and use them as references and indicators for the other businesses.