rm(list=ls(all=T)); gc()## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 490275 26.2 940480 50.3 592000 31.7
## Vcells 916122 7.0 1650153 12.6 1081161 8.3
library(knitr); opts_chunk$set(comment = NA)
library(Matrix)
library(dplyr)
library(googleVis)
library(doParallel)
library(Rtsne)
library(wordcloud)
library(RColorBrewer)
library(randomcoloR)
library(d3heatmap)
library(morpheus)
library(FactoMineR)
library(factoextra)
library(highcharter)
library(tm)
library(slam)
library(MASS)
options(width=100, gvis.plot.tag='chart', digits=4, scipen=60)
# options(width=100, digits=4, scipen=60)prep2 = "/home/tonychuo/_S2018/yelp10/prep2/"
load(paste0(prep2,"businesses.rdata"))
load(paste0(prep2,"reviews.rdata"))revrev = cbind(rev, senti[,-1])
range(rev$date)[1] "2004-07-22" "2017-07-26"
我改用用年中來切年份,這樣可以避免最後一年評論數下降,提升圖表的連續性,也比較容易觀察趨勢,大家可以在2.2的動態圖表裡面,清楚的看到整個目標族群在變數空間裡面移動的效果。
#rev$year = as.integer(format(rev$date, "%Y"))
breaks = as.Date(c("2004-07-22", paste0(2005:2017, "-07-27")))
rev$year = as.integer(cut(rev$date, breaks)) + 2004b100 = rev %>% count(bid) %>% arrange(desc(n)) %>% head(100) %>% .$biddf = rev %>% filter(bid %in% b100) %>%
group_by(bid, year) %>%
summarise(
positive = mean(positive),
negative = mean(negative),
sad = mean(sadness),
n = n()
)y09 = df %>% filter(year >= 2009) %>% count(bid) %>% filter(nn == 9) %>% .$biddf %>% filter(year >= 2010 & bid %in% y09) %>%
data.frame %>%
gvisMotionChart("bid", "year") %>% plotLOAD = TRUE
if(LOAD) {
load(paste0(prep2, "cat822.rdata"))
} else {
library(doParallel)
detectCores()
cores <- makeCluster(16)
registerDoParallel(cores)
getDoParWorkers()
t0 = Sys.time()
cat822 = foreach(i=1:822, .combine=rbind, .packages="Matrix") %dopar% {
rx = rev[rev$bid %in% B$bid[X[,i]],]
cbind(
category = colnames(X)[i],
n = as.integer( table(rx$year) ),
aggregate(. ~ year, rx[,c(1,3:5,7,10:20)], mean)
) }
Sys.time() - t0 # 58.0772 secs
stopCluster(cores)
}df = cat822 %>%
filter(year >= 2010) %>%
filter(category %in% colnames(X)[20:50]) %>%
dplyr::select(category, year, negative, stars, useful, n,
positive, joy, anticipation, trust, surprise,
sadness, anger, disgust, fear, cool, funny
) %>%
#group_by(year) %>% # scale scores by years so they
#mutate_at(c(3:17), scale) %>% # are contemporarily relative
#mutate(n = 0.1 + n - min(n) ) %>% # aviod negative bubble size
data.frame
p = gvisMotionChart(df, "category", "year")
plot(p)選擇在
的商店,將其bid放在japs裡面
japs = rev %>%
filter(bid %in% B$bid[X[,"Japanese"]]) %>%
filter(year >= 2011) %>%
group_by(bid) %>%
summarise(
n_year = n_distinct(year), #
n_rev = n() #
) %>%
filter(
n_year == 7,
n_rev > 100 & n_rev < 300
) %>%
dplyr::select(bid) %>%
left_join(B)Joining, by = "bid"
nrow(japs)[1] 65
選出這一些商店在2011年之後的所有評論,放在資料框R裡面
R = rev %>% filter(bid %in% japs$bid & year >= 2011)
par(mar=c(3,4,3,2), cex=0.8)
R %>% count(bid) %>% .$n %>% hist(8, main="No. Reviews per Biz")txt#########################################################
library(dplyr)
library(sparklyr)
Sys.setenv(SPARK_HOME="/usr/local/spark/spark-2.1.0-bin-hadoop2.7/")
config <- spark_config()
# config$spark.ui.port = "4044"
config$spark.executor.memory = "4G"
config$spark.driver.memory = "4G"
config$spark.yarn.executor.memoryOverhead = "4096"
sc <- spark_connect(master = "spark://hnamenode:7077", config = config)
yelp = "hdfs://192.168.1.100:9000/home/tonychuo/yelp10/pq01/"
Text = spark_read_parquet(sc, "Text", paste0(yelp, "Text"))
txt = Text %>% filter(rid %in% R$rid) %>% collect
spark_disconnect(sc)nrow(txt)[1] 10866
總共有10,925篇評論
c(setequal(R$rid, txt$rid), identical(R$rid, txt$rid))[1] TRUE FALSE
從平台取出的評論次序和R$rid是不同的,所以需要排序一下
txt = left_join(R[,"rid",F], txt)Joining, by = "rid"
identical(R$rid, txt$rid)[1] TRUE
ypath = "/home/tonychuo/_S2018/yelp10/data/"
load(paste0(ypath, "empath.rdata"))
empath = left_join(R[,"rid",F], empath)Joining, by = "rid"
identical(R$rid, empath$rid)[1] TRUE
gc() # release memory used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2654681 141.8 4703850 251.3 3886542 207.6
Vcells 104317347 795.9 891850480 6804.3 1028137623 7844.1
par(mar=c(3,4,3,2), cex=0.8)
R$year %>% table %>% barplot(main="No. Review by Year")df = R %>% group_by(bid, year) %>%
mutate(engage = useful + cool + funny) %>%
summarise_at(
c("engage", "stars", "positive", "negative"),
mean) %>%
left_join(count(R, bid, year)) %>%
data.frameJoining, by = c("bid", "year")
p = gvisMotionChart(df, "bid", "year")
plot(p)df = left_join(df, B[,c("bid","name")]) # merge in biz nameJoining, by = "bid"
df = cbind(df[,-c(1:2)], df[,1:2,F]) %>% # move bid to the last column
filter(! bid %in% c(6115, 3406, 30751)) %>%
data.frame
p = gvisMotionChart(df, "name", "year")
plot(p)使用線性回歸找出星等上升、下降最快的商店
library(stringr)
bx = sapply(split(df, df$bid), function(x) {
lm(stars ~ year, x) %>% coef %>% `[`("year") }) %>% # take lm coef.
.[ abs(.) > 0.15 ] %>% #
names %>% str_extract("[0-9]*") %>%
as.integerp = df %>% filter(bid %in% bx) %>% gvisMotionChart("name", "year")
plot(p)Make biz-theme matrix - mx
library(d3heatmap)
mx = sapply(split(empath[,-1], R$bid), colMeans) %>% t # biz-theme matrixbnames = B$name[match(rownames(mx), B$bid)]
rownames(mx) = bnamesSelect themes with medium-high weights
rx = colSums(mx) %>% quantile(c(0.60, 0.95)) #
mx = mx[, colSums(mx) > rx[1] & colSums(mx) < rx[2]] #Check the range of weights
par(cex=0.8)
hist(log(mx+1e-4))mx %>% {log(.+1e-4)} %>% t %>% d3heatmap(color=rev(cm.colors(9)))這個工具不能投射在網頁上,只能直接在Rstudio裡面做
library(morpheus)
mx %>% {log(.+1e-4)} %>% morpheus將商店投射到尺度縮減之後的情緒平面上
pcx = sapply(split(R[,10:19], R$bid), colMeans) %>% t
rownames(pcx) = bnameslibrary(FactoMineR)
library(factoextra)
pcx = pcx %>% scale %>% PCA(ncp=10, graph=F) par(cex=0.8)
barplot(pcx$eig[1:10,3],names=1:10,main="Accumulated Variance",
xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray') 前三個主成分已經涵蓋了90%的變異
source("bipcx.R")
N = n_distinct(R$bid)
bipcx(pcx,1,2,10,N,t1="Strength",t2="Valence",
obs='Business', main="Strength & Valence of Sentiment")bipcx(pcx,3,2,10,N,t1="Arosual",t2="Valence",
obs='Business', main="Strength & Valence of Sentiment")library(tm)
dtm = txt$text %>%
iconv(to = "utf-8", sub="") %>%
VectorSource %>% Corpus %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords("english")) %>%
tm_map(stemDocument) %>%
DocumentTermMatrix %>%
removeSparseTerms(0.998)
dtm # (documents: 10925, terms: 2520)<<DocumentTermMatrix (documents: 10866, terms: 2535)>>
Non-/sparse entries: 442136/27103174
Sparsity : 98%
Maximal term length: 13
Weighting : term frequency (tf)
TF-IDF篩選字詞library(slam)
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
log2(nrow(dtm)/col_sums(dtm > 0))
summary(tfidf) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0505 0.1189 0.1466 0.1616 0.1834 1.3136
dtm = dtm[, tfidf > 0.1471 ]
dtm = dtm[,order(-col_sums(dtm))]
dim(dtm)[1] 10866 1255
tSNE做尺度縮減library(Rtsne)
n = 800
tsne = dtm[, 1:n] %>% as.data.frame.matrix %>%
scale %>% t %>% Rtsne(
check_duplicates = FALSE, theta=0.0, max_iter=3200)Y = tsne$Y # tSNE coordinates
d = dist(Y) # distance matrix
hc = hclust(d) # hi-clustering
K = 100 # number of clusters
g = cutree(hc,K) # cut into K clusters
table(g) %>% as.vector %>% sort # sizes of clusters [1] 2 2 2 2 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5 5
[32] 5 5 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8
[63] 8 8 8 9 9 9 10 10 10 10 11 11 11 11 11 11 11 11 11 12 12 12 12 12 13 13 13 14 14 14 15
[94] 15 15 15 16 17 18 22
library(randomcoloR)
library(wordcloud)
wc = col_sums(dtm[,1:n])
colors = distinctColorPalette(K)
png("fig/japs2.png", width=3200, height=1800)
textplot(
Y[,1], Y[,2], colnames(dtm)[1:n], show=F,
col=colors[g],
cex= 0.2 + 1.25 * sqrt(wc/mean(wc)),
font=2)
dev.off()png
2
bizatt 資料框summary(bizatt) bid AcceptsInsurance AgesAllowed Alcohol BYOB
Min. : 1 Mode :logical 18plus : 86 beer_and_wine: 6444 Mode :logical
1st Qu.: 39150 FALSE:2149 19plus : 63 full_bar : 18767 FALSE:857
Median : 78310 TRUE :6426 21plus : 211 none : 19022 TRUE :48
Mean : 78311 NA's :147684 allages: 35 NA's :112026 NA's :155354
3rd Qu.:117462 NA's :155864
Max. :156639
BYOBCorkage BikeParking BusinessAcceptsBitcoin BusinessAcceptsCreditCards
no : 752 Mode :logical Mode :logical Mode :logical
yes_corkage: 157 FALSE:16419 FALSE:8663 FALSE:7579
yes_free : 491 TRUE :57064 TRUE :211 TRUE :113145
NA's :154859 NA's :82776 NA's :147385 NA's :35535
ByAppointmentOnly Caters CoatCheck Corkage DogsAllowed DriveThru
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:18059 FALSE:17221 FALSE:6887 FALSE:512 FALSE:7642 FALSE:3686
TRUE :16714 TRUE :17306 TRUE :1089 TRUE :140 TRUE :3568 TRUE :2352
NA's :121486 NA's :121732 NA's :148283 NA's :155607 NA's :145049 NA's :150221
GoodForDancing GoodForKids HappyHour HasTV NoiseLevel Open24Hours
Mode :logical Mode :logical Mode :logical Mode :logical average : 26119 Mode :logical
FALSE:6470 FALSE:11114 FALSE:2478 FALSE:21116 loud : 3908 FALSE:318
TRUE :2062 TRUE :47232 TRUE :5937 TRUE :22601 quiet : 8947 TRUE :29
NA's :147727 NA's :97913 NA's :147844 NA's :112542 very_loud: 1670 NA's :155912
NA's :115615
OutdoorSeating RestaurantsAttire RestaurantsCounterService RestaurantsDelivery
Mode :logical casual: 43277 Mode :logical Mode :logical
FALSE:29680 dressy: 1397 FALSE:150 FALSE:35526
TRUE :20619 formal: 125 TRUE :246 TRUE :11264
NA's :105960 NA's :111460 NA's :155863 NA's :109469
RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsReservations RestaurantsTableService
Mode :logical Min. :1 Mode :logical Mode :logical
FALSE:6223 1st Qu.:1 FALSE:28014 FALSE:14318
TRUE :43391 Median :2 TRUE :18496 TRUE :25636
NA's :106645 Mean :2 NA's :109749 NA's :116305
3rd Qu.:2
Max. :4
NA's :59929
RestaurantsTakeOut Smoking WheelchairAccessible WiFi casual
Mode :logical no : 3019 Mode :logical free: 20924 Mode :logical
FALSE:5355 outdoor: 3513 FALSE:5815 no : 21954 FALSE:23947
TRUE :49277 yes : 1140 TRUE :38202 paid: 543 TRUE :18942
NA's :101627 NA's :148587 NA's :112242 NA's:112838 NA's :113370
classy divey hipster intimate romantic touristy
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:41968 FALSE:26060 FALSE:41903 FALSE:42299 FALSE:42367 FALSE:42658
TRUE :921 TRUE :1111 TRUE :963 TRUE :590 TRUE :522 TRUE :231
NA's :113370 NA's :129088 NA's :113393 NA's :113370 NA's :113370 NA's :113370
trendy upscale friday monday saturday sunday
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:40921 FALSE:42492 FALSE:1857 FALSE:5698 FALSE:1726 FALSE:5113
TRUE :1968 TRUE :397 TRUE :4534 TRUE :693 TRUE :4665 TRUE :1278
NA's :113370 NA's :113370 NA's :149868 NA's :149868 NA's :149868 NA's :149868
thursday tuesday wednesday garage lot street
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:3841 FALSE:5738 FALSE:5247 FALSE:84470 FALSE:54451 FALSE:72745
TRUE :2550 TRUE :653 TRUE :1144 TRUE :4950 TRUE :34969 TRUE :16675
NA's :149868 NA's :149868 NA's :149868 NA's :66839 NA's :66839 NA's :66839
valet validated dairy-free gluten-free halal kosher
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:87761 FALSE:86762 FALSE:231 FALSE:225 FALSE:243 FALSE:245
TRUE :1659 TRUE :440 TRUE :18 TRUE :24 TRUE :6 TRUE :4
NA's :66839 NA's :69057 NA's :156010 NA's :156010 NA's :156010 NA's :156010
soy-free vegan vegetarian breakfast brunch dessert
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:243 FALSE:128 FALSE:177 FALSE:39214 FALSE:39388 FALSE:41418
TRUE :6 TRUE :121 TRUE :72 TRUE :3726 TRUE :3552 TRUE :1522
NA's :156010 NA's :156010 NA's :156010 NA's :113319 NA's :113319 NA's :113319
dinner latenight lunch africanamerican asian coloring
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:27638 FALSE:40462 FALSE:25109 FALSE:751 FALSE:598 FALSE:279
TRUE :15302 TRUE :2478 TRUE :17831 TRUE :423 TRUE :576 TRUE :1126
NA's :113319 NA's :113319 NA's :113319 NA's :155085 NA's :155085 NA's :154854
curly extensions kids perms straightperms background_music
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:399 FALSE:570 FALSE:693 FALSE:745 FALSE:672 FALSE:6193
TRUE :1006 TRUE :835 TRUE :712 TRUE :660 TRUE :502 TRUE :1833
NA's :154854 NA's :154854 NA's :154854 NA's :154854 NA's :155085 NA's :148233
dj jukebox karaoke live no_music video
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:6312 FALSE:6895 FALSE:7785 FALSE:6621 FALSE:8026 FALSE:7819
TRUE :1714 TRUE :1131 TRUE :241 TRUE :1405 NA's :148233 TRUE :207
NA's :148233 NA's :148233 NA's :148233 NA's :148233 NA's :148233
bizatt$RestaurantsPriceRange2[ X[,"Restaurants"] ] %>% table(useNA="ifany").
1 2 3 4 <NA>
19013 24917 2845 537 4299
grep("mex|ita",colnames(X),T, value=T) [1] "Italian" "Mexican" "Tex-Mex"
[4] "Hospitals" "Vitamins & Supplements" "Rehabilitation Center"
[7] "Guitar Stores" "Meditation Centers" "Emergency Pet Hospital"
[10] "New Mexican Cuisine"
italian = X[,"Italian"] & X[,"Restaurants"]
mexican = X[,"Mexican"] & X[,"Restaurants"]
c(sum(italian), sum(mexican), sum(italian & mexican))[1] 4410 3913 25
bizatt$RestaurantsPriceRange2[ italian ] %>% table(useNA="ifany").
1 2 3 4 <NA>
788 2698 519 70 335
bizatt$RestaurantsPriceRange2[ mexican ] %>% table(useNA="ifany").
1 2 3 4 <NA>
2286 1388 41 5 193