rm(list=ls(all=T)); gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1504013 80.4 2637877 140.9 2637877 140.9
Vcells 3090031 23.6 175745030 1340.9 133302492 1017.1
library(knitr); opts_chunk$set(comment = NA)
library(Matrix)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(googleVis)
library(doParallel)
Loading required package: foreach
Loading required package: iterators
Loading required package: parallel
library(Rtsne)
library(wordcloud)
Loading required package: RColorBrewer
library(RColorBrewer)
library(randomcoloR)
library(d3heatmap)
library(morpheus)
library(FactoMineR)
library(factoextra)
Loading required package: ggplot2
library(highcharter)
library(tm)
Loading required package: NLP
Attaching package: ‘NLP’
The following object is masked from ‘package:ggplot2’:
annotate
library(slam)
library(MASS)
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
#options(width=100, gvis.plot.tag='chart', digits=4, scipen=60)
options(width=100, digits=4, scipen=60)
prep2 = "/home/tonychuo/_S2018/yelp10/prep2/"
load(paste0(prep2,"businesses.rdata"))
load(paste0(prep2,"reviews.rdata"))
rev = cbind(rev, senti[,-1])   # append the sentiment scores to the review table
breaks = as.Date(c("2004-07-22", paste0(2005:2017, "-07-27")))   # custom year boundaries
rev$year = as.integer(cut(rev$date, breaks)) + 2004              # review 'year' is cut at July 27, not Jan 1
range(rev$date)
[1] "2004-07-22" "2017-07-26"
b100 = rev %>% count(bid) %>% arrange(desc(n)) %>% head(100) %>% .$bid
df = rev %>% filter(bid %in% b100) %>%
group_by(bid, year) %>%
summarise(
positive = mean(positive),
negative = mean(negative),
sad = mean(sadness),
n = n()
)
y09 = df %>% filter(year >= 2009) %>% count(bid) %>% filter(nn == 9) %>% .$bid   # bids with reviews in every year 2009-2017
df %>% filter(year >= 2010 & year <= 2016 & bid %in% y09) %>%
data.frame %>%
gvisMotionChart("bid", "year") %>% plot
LOAD = TRUE
if(LOAD) {
load(paste0(prep2, "cat822.rdata"))
} else {
library(doParallel)
detectCores()
cores <- makeCluster(16)
registerDoParallel(cores)
getDoParWorkers()
t0 = Sys.time()
cat822 = foreach(i=1:822, .combine=rbind, .packages="Matrix") %dopar% {
rx = rev[rev$bid %in% B$bid[X[,i]],]   # reviews of the businesses in category i
cbind(
category = colnames(X)[i],
n = as.integer( table(rx$year) ),      # number of reviews per year
aggregate(. ~ year, rx[,c(1,3:5,7,10:20)], mean)   # yearly means of ratings and sentiment scores
) }
Sys.time() - t0 # 58.0772 secs
stopCluster(cores)
}
ourGroup <- c("Bakeries","Desserts","Ice Cream & Frozen Yogurt")
df = cat822 %>%
filter(year >= 2010 & year <= 2016) %>%
filter(category %in% ourGroup) %>%
dplyr::select(category, year, negative, stars, useful, n,
positive, joy, anticipation, trust, surprise,
sadness, anger, disgust, fear, cool, funny
) %>%
group_by(year) %>% # scale scores within each year so they become...
# mutate_at(c(3:17), scale) %>% # ...contemporarily relative
# mutate(n = 0.1 + n - min(n) ) %>% # avoid negative bubble sizes
data.frame
p = gvisMotionChart(df, "category", "year")
plot(p)
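For reference, if the two commented-out lines above were switched on, the preparation step would look roughly like the sketch below (not part of the original run; scale() returns a one-column matrix, so it is wrapped in as.numeric() to keep the columns plain).
dfs = cat822 %>%
  filter(year >= 2010 & year <= 2016) %>%
  filter(category %in% ourGroup) %>%
  dplyr::select(category, year, negative, stars, useful, n,
                positive, joy, anticipation, trust, surprise,
                sadness, anger, disgust, fear, cool, funny) %>%
  group_by(year) %>%                                   # standardize scores within each year
  mutate_at(3:17, function(x) as.numeric(scale(x))) %>%
  mutate(n = 0.1 + n - min(n)) %>%                     # shift n so bubble sizes stay positive
  data.frame
plot(gvisMotionChart(dfs, "category", "year"))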
Select the Bakeries businesses that have reviews in every year from 2011 onward and more than 100 reviews in total, and put their bid into japs.
japs = rev %>%
filter(bid %in% B$bid[X[,"Bakeries"]]) %>%
filter(year >= 2011) %>%
group_by(bid) %>%
summarise(
n_year = n_distinct(year),   # number of distinct review years
n_rev = n()                  # total number of reviews
) %>%
filter(
n_year == 7,                 # reviewed in every year 2011-2017
n_rev > 100                  # more than 100 reviews in total
) %>%
dplyr::select(bid) %>%
left_join(B)
Joining, by = "bid"
nrow(japs)
[1] 91
Select all of these businesses' reviews from 2011 onward and put them into the data frame R.
R = rev %>% filter(bid %in% japs$bid & year >= 2011)
par(mar=c(3,4,3,2), cex=0.8)
R %>% count(bid) %>% .$n %>% hist(8, main="No. Reviews per Biz")
Retrieve the review texts - txt
library(dplyr)
library(sparklyr)
Sys.setenv(SPARK_HOME="/usr/local/spark/spark-2.1.0-bin-hadoop2.7/")
config <- spark_config()
# config$spark.ui.port = "4044"
config$spark.executor.memory = "4G"
config$spark.driver.memory = "4G"
config$spark.yarn.executor.memoryOverhead = "4096"
sc <- spark_connect(master = "spark://hnamenode:7077", config = config)
yelp = "hdfs://192.168.1.100:9000/home/tonychuo/yelp10/pq01/"
Text = spark_read_parquet(sc, "Text", paste0(yelp, "Text"))
txt = Text %>% filter(rid %in% R$rid) %>% collect
spark_disconnect(sc)
nrow(txt)
[1] 21968
There are 21,968 reviews in total.
c(setequal(R$rid, txt$rid), identical(R$rid, txt$rid))
[1] TRUE FALSE
The reviews returned by the platform are not in the same order as R$rid, so they need to be re-aligned.
txt = left_join(R[,"rid",F], txt)
Joining, by = "rid"
identical(R$rid, txt$rid)
[1] TRUE
ypath = "/home/tonychuo/_S2018/yelp10/data/"
load(paste0(ypath, "empath.rdata"))
empath = left_join(R[,"rid",F], empath)
Joining, by = "rid"
identical(R$rid, empath$rid)
[1] TRUE
gc() # release memory
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 2729804 145.8 4703850 251.3 3886542 207.6
Vcells 107509834 820.3 938316084 7158.8 1031300313 7868.2
par(mar=c(3,4,3,2), cex=0.8)
R$year %>% table %>% barplot(main="No. Reviews by Year")
df = R %>% group_by(bid, year) %>%
mutate(engage = useful + cool + funny) %>%
summarise_at(
c("engage", "stars", "positive", "negative"),
mean) %>%
left_join(count(R, bid, year)) %>%
data.frame
Joining, by = c("bid", "year")
p = gvisMotionChart(df, "bid", "year")
plot(p)
df = left_join(df, B[,c("bid","name")]) # merge in biz name
Joining, by = "bid"
df = cbind(df[,-c(1:2)], df[,1:2,F]) %>% # move bid and year to the last columns
filter(! bid %in% c(6115, 3406, 30751)) %>%
data.frame
df$name = paste0(df$name, df$bid)   # append bid so the display names are unique
p = gvisMotionChart(df, "name", "year")
plot(p)
Use linear regression to find the businesses whose star ratings rose or fell the fastest.
library(stringr)
bx = sapply(split(df, df$bid), function(x) {
lm(stars ~ year, x) %>% coef %>% `[`("year") }) %>% # slope of stars over year for each business
.[ abs(.) > 0.15 ] %>% # keep only slopes steeper than ±0.15 stars per year
names %>% str_extract("[0-9]*") %>% # recover the bid from the list names
as.integer
p = df %>% filter(bid %in% bx) %>% gvisMotionChart("name", "year")
plot(p)
Make biz-theme matrix - mx
library(d3heatmap)
mx = sapply(split(empath[,-1], R$bid), colMeans) %>% t # biz-theme matrix
bnames = B$name[match(rownames(mx), B$bid)]
rownames(mx) = bnames
Select themes with medium-high weights
rx = colSums(mx) %>% quantile(c(0.60, 0.95)) # 60th and 95th percentiles of total theme weight
mx = mx[, colSums(mx) > rx[1] & colSums(mx) < rx[2]] # keep themes between the two cutoffs
Check the range of weights
par(cex=0.8)
hist(log(mx+1e-4))
mx %>% {log(.+1e-4)} %>% t %>% d3heatmap(color=rev(cm.colors(9)))
This tool cannot be embedded in a web page; it can only be used interactively in RStudio.
library(morpheus)
mx %>% {log(.+1e-4)} %>% morpheus
Project the businesses onto the dimension-reduced sentiment plane.
pcx = sapply(split(R[,10:19], R$bid), colMeans) %>% t
rownames(pcx) = bnames
library(FactoMineR)
library(factoextra)
pcx = pcx %>% scale %>% PCA(ncp=10, graph=F)
par(cex=0.8)
barplot(pcx$eig[1:10,3], names=1:10, main="Cumulative Variance",
xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')
The first three principal components already account for 90% of the variance.
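This can be read directly off the PCA object (a small check; FactoMineR stores the cumulative percentage of variance in the third column of pcx$eig):
round(pcx$eig[1:3, 3], 1)   # cumulative % of variance explained by the first three components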
source("bipcx.R")
N = n_distinct(R$bid)
bipcx(pcx,1,2,10,N,t1="Strength",t2="Valence",
obs='Business', main="Strength & Valence of Sentiment")
bipcx(pcx,3,2,10,N,t1="Arousal",t2="Valence",
obs='Business', main="Arousal & Valence of Sentiment")
library(tm)
dtm = txt$text %>%
iconv(to = "utf-8", sub="") %>%   # drop characters that cannot be converted to UTF-8
VectorSource %>% Corpus %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords("english")) %>%
tm_map(stemDocument) %>%
DocumentTermMatrix %>%
removeSparseTerms(0.998)   # drop terms that appear in fewer than 0.2% of the documents
dtm # (documents: 21968, terms: 2502)
<<DocumentTermMatrix (documents: 21968, terms: 2502)>>
Non-/sparse entries: 896315/54067621
Sparsity : 98%
Maximal term length: 13
Weighting : term frequency (tf)
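Before any filtering, the most common terms can be eyeballed directly (a sketch; the 2000-occurrence floor is an arbitrary choice):
findFreqTerms(dtm, lowfreq = 2000)   # terms occurring at least 2000 times overall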
Filter terms by TF-IDF
library(slam)
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
log2(nrow(dtm)/col_sums(dtm > 0))
summary(tfidf)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0503 0.1203 0.1475 0.1586 0.1786 0.9654
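A data-driven way to pick the cutoff used below would be to take it from the distribution itself; the hard-coded 0.1471 sits just below the median, so the two choices are nearly equivalent (a sketch, not the original code):
cutoff = median(tfidf)          # ~0.1475, keeps roughly the upper half of the terms
c(cutoff, sum(tfidf > cutoff))  # cutoff value and number of surviving terms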
dtm = dtm[, tfidf > 0.1471 ] # keep terms with roughly above-median tf-idf
dtm = dtm[, order(-col_sums(dtm))] # sort terms by overall frequency, most frequent first
dim(dtm)
[1] 21968 1265
Dimension reduction with t-SNE
library(Rtsne)
n = 800
tsne = dtm[, 1:n] %>% as.data.frame.matrix %>%
scale %>% t %>% Rtsne(
check_duplicates = FALSE, theta=0.0, max_iter=3200)
Y = tsne$Y # tSNE coordinates
d = dist(Y) # distance matrix
hc = hclust(d) # hi-clustering
K = 100 # number of clusters
g = cutree(hc,K) # cut into K clusters
table(g) %>% as.vector %>% sort # sizes of clusters
[1] 2 2 2 2 2 2 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5
[31] 5 5 5 6 6 6 6 6 6 6 6 7 7 7 7 8 8 8 8 8 8 8 9 9 9 9 9 9 9 9
[61] 9 9 9 9 9 10 10 10 10 10 10 10 10 10 10 11 11 11 11 11 11 11 11 12 12 12 12 12 12 12
[91] 13 13 13 15 16 16 16 17 17 19
library(randomcoloR)
library(wordcloud)
wc = col_sums(dtm[,1:n])
colors = distinctColorPalette(K)
png("fig/japs2.png", width=3200, height=1800)
textplot(
Y[,1], Y[,2], colnames(dtm)[1:n], show=F,
col=colors[g],
cex= 0.2 + 1.25 * sqrt(wc/mean(wc)),
font=2)
dev.off()
null device
1
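The cluster labels in g can also be inspected as plain word lists, which helps when the big text plot is hard to read (a sketch; g indexes the first n = 800 terms):
terms = colnames(dtm)[1:n]
biggest = which.max(table(g))            # label of the largest cluster
terms[g == as.integer(names(biggest))]   # its member terms
# split(terms, g)                        # or list every cluster at once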
The bizatt data frame
summary(bizatt)
bid AcceptsInsurance AgesAllowed Alcohol BYOB
Min. : 1 Mode :logical 18plus : 86 beer_and_wine: 6444 Mode :logical
1st Qu.: 39150 FALSE:2149 19plus : 63 full_bar : 18767 FALSE:857
Median : 78310 TRUE :6426 21plus : 211 none : 19022 TRUE :48
Mean : 78311 NA's :147684 allages: 35 NA's :112026 NA's :155354
3rd Qu.:117462 NA's :155864
Max. :156639
BYOBCorkage BikeParking BusinessAcceptsBitcoin BusinessAcceptsCreditCards
no : 752 Mode :logical Mode :logical Mode :logical
yes_corkage: 157 FALSE:16419 FALSE:8663 FALSE:7579
yes_free : 491 TRUE :57064 TRUE :211 TRUE :113145
NA's :154859 NA's :82776 NA's :147385 NA's :35535
ByAppointmentOnly Caters CoatCheck Corkage DogsAllowed
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:18059 FALSE:17221 FALSE:6887 FALSE:512 FALSE:7642
TRUE :16714 TRUE :17306 TRUE :1089 TRUE :140 TRUE :3568
NA's :121486 NA's :121732 NA's :148283 NA's :155607 NA's :145049
DriveThru GoodForDancing GoodForKids HappyHour HasTV
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:3686 FALSE:6470 FALSE:11114 FALSE:2478 FALSE:21116
TRUE :2352 TRUE :2062 TRUE :47232 TRUE :5937 TRUE :22601
NA's :150221 NA's :147727 NA's :97913 NA's :147844 NA's :112542
NoiseLevel Open24Hours OutdoorSeating RestaurantsAttire RestaurantsCounterService
average : 26119 Mode :logical Mode :logical casual: 43277 Mode :logical
loud : 3908 FALSE:318 FALSE:29680 dressy: 1397 FALSE:150
quiet : 8947 TRUE :29 TRUE :20619 formal: 125 TRUE :246
very_loud: 1670 NA's :155912 NA's :105960 NA's :111460 NA's :155863
NA's :115615
RestaurantsDelivery RestaurantsGoodForGroups RestaurantsPriceRange2 RestaurantsReservations
Mode :logical Mode :logical Min. :1 Mode :logical
FALSE:35526 FALSE:6223 1st Qu.:1 FALSE:28014
TRUE :11264 TRUE :43391 Median :2 TRUE :18496
NA's :109469 NA's :106645 Mean :2 NA's :109749
3rd Qu.:2
Max. :4
NA's :59929
RestaurantsTableService RestaurantsTakeOut Smoking WheelchairAccessible WiFi
Mode :logical Mode :logical no : 3019 Mode :logical free: 20924
FALSE:14318 FALSE:5355 outdoor: 3513 FALSE:5815 no : 21954
TRUE :25636 TRUE :49277 yes : 1140 TRUE :38202 paid: 543
NA's :116305 NA's :101627 NA's :148587 NA's :112242 NA's:112838
casual classy divey hipster intimate
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:23947 FALSE:41968 FALSE:26060 FALSE:41903 FALSE:42299
TRUE :18942 TRUE :921 TRUE :1111 TRUE :963 TRUE :590
NA's :113370 NA's :113370 NA's :129088 NA's :113393 NA's :113370
romantic touristy trendy upscale friday
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:42367 FALSE:42658 FALSE:40921 FALSE:42492 FALSE:1857
TRUE :522 TRUE :231 TRUE :1968 TRUE :397 TRUE :4534
NA's :113370 NA's :113370 NA's :113370 NA's :113370 NA's :149868
monday saturday sunday thursday tuesday
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:5698 FALSE:1726 FALSE:5113 FALSE:3841 FALSE:5738
TRUE :693 TRUE :4665 TRUE :1278 TRUE :2550 TRUE :653
NA's :149868 NA's :149868 NA's :149868 NA's :149868 NA's :149868
wednesday garage lot street valet
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:5247 FALSE:84470 FALSE:54451 FALSE:72745 FALSE:87761
TRUE :1144 TRUE :4950 TRUE :34969 TRUE :16675 TRUE :1659
NA's :149868 NA's :66839 NA's :66839 NA's :66839 NA's :66839
validated dairy-free gluten-free halal kosher
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:86762 FALSE:231 FALSE:225 FALSE:243 FALSE:245
TRUE :440 TRUE :18 TRUE :24 TRUE :6 TRUE :4
NA's :69057 NA's :156010 NA's :156010 NA's :156010 NA's :156010
soy-free vegan vegetarian breakfast brunch
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:243 FALSE:128 FALSE:177 FALSE:39214 FALSE:39388
TRUE :6 TRUE :121 TRUE :72 TRUE :3726 TRUE :3552
NA's :156010 NA's :156010 NA's :156010 NA's :113319 NA's :113319
dessert dinner latenight lunch africanamerican
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:41418 FALSE:27638 FALSE:40462 FALSE:25109 FALSE:751
TRUE :1522 TRUE :15302 TRUE :2478 TRUE :17831 TRUE :423
NA's :113319 NA's :113319 NA's :113319 NA's :113319 NA's :155085
asian coloring curly extensions kids
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:598 FALSE:279 FALSE:399 FALSE:570 FALSE:693
TRUE :576 TRUE :1126 TRUE :1006 TRUE :835 TRUE :712
NA's :155085 NA's :154854 NA's :154854 NA's :154854 NA's :154854
perms straightperms background_music dj jukebox
Mode :logical Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:745 FALSE:672 FALSE:6193 FALSE:6312 FALSE:6895
TRUE :660 TRUE :502 TRUE :1833 TRUE :1714 TRUE :1131
NA's :154854 NA's :155085 NA's :148233 NA's :148233 NA's :148233
karaoke live no_music video
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:7785 FALSE:6621 FALSE:8026 FALSE:7819
TRUE :241 TRUE :1405 NA's :148233 TRUE :207
NA's :148233 NA's :148233 NA's :148233
bizatt$RestaurantsPriceRange2[ X[,"Restaurants"] ] %>% table(useNA="ifany")
.
1 2 3 4 <NA>
19013 24917 2845 537 4299
grep("mex|ita",colnames(X),T, value=T)
[1] "Italian" "Mexican" "Tex-Mex"
[4] "Hospitals" "Vitamins & Supplements" "Rehabilitation Center"
[7] "Guitar Stores" "Meditation Centers" "Emergency Pet Hospital"
[10] "New Mexican Cuisine"
italian = X[,"Italian"] & X[,"Restaurants"]
mexican = X[,"Mexican"] & X[,"Restaurants"]
c(sum(italian), sum(mexican), sum(italian & mexican))
[1] 4410 3913 25
bizatt$RestaurantsPriceRange2[ italian ] %>% table(useNA="ifany")
.
1 2 3 4 <NA>
788 2698 519 70 335
bizatt$RestaurantsPriceRange2[ mexican ] %>% table(useNA="ifany")
.
1 2 3 4 <NA>
2286 1388 41 5 193
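To make the contrast explicit, the two price-range distributions can be put side by side as proportions (a sketch using the same objects as above; NAs are dropped by table()):
rbind(
  Italian = prop.table(table(bizatt$RestaurantsPriceRange2[italian])),
  Mexican = prop.table(table(bizatt$RestaurantsPriceRange2[mexican]))
) %>% round(3)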