這一段程式可以讓同學們在動態網頁上觀察:
- 某一種(群)內容 (194 classes),
- 某一個(群)字詞,或
- 某一個(群)字根
- 某一種情緒
對於評論的:
的效果。
透過互動式的圖形, 我們也可以觀察這些效果是否會隨商業類別(508 categories)發生變化。
Sys.setlocale('LC_ALL','C')
[1] “C”
library(magrittr)
library(highcharter)
library(slam)
library(tm)
package ‘tm’ was built under R version 3.4.1Loading required package: NLP
library(SnowballC)
library(RColorBrewer)
# set color palette
pals = c(brewer.pal(8,"Set2")[c(6)],
brewer.pal(8,"Dark2"),
brewer.pal(8,"Set1")[c(1)])
# load data
load('data/yelp1.rdata')
load('data/average.rdata')
load('data/empath.rdata')
load('data/dtm0.rdata')
(1) Preparation
To aviod glitches, we only analyze the business categories with more than 500 reviews.
First we do a quick hi-clustering on these (123) categories by their normolized class scores.
# clusters of categories by classes scores
d = dist(scale(wx[CA$nrev>500,]))
hc = hclust(d,"ward.D2")
Cut them into 10 groups.
kg = cutree(hc,10)
kg = order(order(-table(kg)))[kg]
table(kg)
kg
1 2 3 4 5 6 7 8 9 10
44 23 17 10 8 6 5 4 3 3
Name the groups and combine the group_id with the category profiles.
gnames = c('Restaurant','Bar', 'Shop', 'Venue', 'Service',
'Art', 'Beauty', 'Fitness', 'Health', 'Public')
cats = cbind( cat=rownames(CA)[CA$nrev>500],
grp=factor(kg,labels=gnames),
CA[CA$nrev>500,] )
head(cats)
(2) Helper Function
Make a helper function for interactive charting …
# helper function
make.chart = function(bygroup=F) { # x,X,y,Y,mxBCm=,review
# overall usage and lift
N = as.numeric(length(X))
txt = sprintf("OVERALL: Usage=%.3f, Conf=%.3f, Base=%.3f, Lift=%.3f",
sum(X)/N, sum(X&Y)/sum(X), sum(Y)/N,
N*sum(Y&X)/sum(X)/sum(Y))
if(bygroup) {
mx = sapply(levels(cats$grp), function(g)
rowSums(mxBC[, as.character(cats$cat[cats$grp == g])]) > 0 )
bubbles = list(maxSize="20%",minSize=20)
df2 = aggregate(.~grp,cats[,2:4],sum) }
else {
mx = mxBC[,CA$nrev>500]
bubbles = list(maxSize="10%",minSize=10)
df2 = cats[,1:4] }
# cases
df = mx %>% apply(2,function(v) {
i = review$bid %in% rownames(mxBC)[v]
n = as.numeric(sum(i))
c( usage = sum(X[i])/n,
base = round(sum(Y[i])/n, 3),
conf = round(sum(X[i]&Y[i])/sum(X[i]), 3),
lift = n * sum(Y[i]&X[i]) / sum(X[i]) / sum(Y[i]) )
}) %>% t %>% data.frame %>% cbind(df2)
# bubble chart
tips=paste0(ifelse(bygroup, "", "<b>{point.cat}</b><br>"),
"no.rev/biz: {point.nrev} / {point.nbiz}<br>",
"conf/base: {point.conf} / {point.base}")
hchart(df,"scatter",hcaes(x=usage,y=lift,size=nrev,group=grp)) %>%
hc_title(text=sprintf("The effect of <b>%s</b> on <b>%s</b>",x,y)) %>%
hc_subtitle(text = txt) %>% hc_colors(pals) %>%
hc_tooltip(hideDelay=100,useHTML=T,pointFormat=tips) %>%
hc_plotOptions(bubble = bubbles) %>% hc_size(height=640) %>%
hc_chart(zoomType="xy") %>% hc_add_theme(hc_theme_flat())
}
(3) Explore the Effect of Content Classes
3.1 某一種內容
y = "funny"; Y = review[, y] %>% {. > median(.)}
x = "swearing_terms"; X = scores[, x] %>% {. > median(.)}
make.chart()
For the entire corpus and each business categories, we calculate and display:
Usage = \(P[X]\) : the base probability of \(X\)
Conf = \(P[Y|X]\) : the probability of \(Y\) given \(X\)
Base = \(P[Y]\) : the base probability of \(Y\)
Lift = \(\frac{P[Y|X]}{P[Y]}\) : the lift of \(X\) on \(P[Y]\)
3.2 某一群內容
可以一次看一群內容,也可以將整群商業類別集合起來一起看 …
y = "funny"; Y = review[, y] %>% {. > median(.)}
x = "sexual+lust"
X = rowSums(sapply(c('sexual','lust'), # <-- 將內容放在括弧裡面
function(s) scores[,s] > median(scores[,s]))) > 0
make.chart(bygroup=TRUE)
3.3 某一群(個)字根
y = "cool"; Y = review[, y] %>% {. > median(.)}
# 將字根放在括弧裡面
terms = grep("^(authen|genuin|pure|innoc|origin|true|truth)",
dtm$dimnames$Terms[1:10000], value=F)
x = "authenticity"; X = as.integer(row_sums(dtm[,terms]) > 0)
make.chart()
3.4 某一群(個)字
To analyze the effect of words, we need a new document term matrix (dtm2).
corp = Corpus(VectorSource(review$text))
corp = tm_map(corp, content_transformer(tolower))
corp = tm_map(corp, removePunctuation)
dtm2 = DocumentTermMatrix(corp); dtm2
<<DocumentTermMatrix (documents: 215879, terms: 215402)>>
Non-/sparse entries: 15740971/46485027387
Sparsity : 100%
Maximal term length: 932
Weighting : term frequency (tf)
dtm2 = removeSparseTerms(dtm2, .999); dtm2
<<DocumentTermMatrix (documents: 215879, terms: 5230)>>
Non-/sparse entries: 14361152/1114686018
Sparsity : 99%
Maximal term length: 15
Weighting : term frequency (tf)
y = "useful"; Y = review[, y] %>% {. > median(.)}
w = which(colnames(dtm2) %in% c('but','however','nonetheless'))
x = "but..."; X = as.integer(row_sums(dtm2[, w]) > 0)
make.chart(bygroup=TRUE)
3.5 標點符號
y = "useful"; Y = review[, y] %>% {. > median(.)}
x = "?"; X = grepl("?",review$text,fixed=T)
make.chart(bygroup=TRUE)
More than one Punctuation …
x = "? ..."; X = grepl("\\?|!",review$text)
make.chart()
3.6 情緒
y = "useful"; Y = review[, y] %>% {. > median(.)}
x = "anger"; X = senti[, x] %>% {. > median(.)}
make.chart(bygroup=TRUE)
x = "positive"; X = senti[, x] %>% {. > median(.)}
make.chart()
x = "negative"; X = senti[, x] %>% {. > median(.)}
make.chart()
---
title: "內容和字詞對評論的效果"
subtitle: Yelp Kaggle, Effect Explorer
author: "Tony Chuo"
date: "2017/08/04"
output:
  html_notebook:
    highlight: textmate
    theme: lumen
---

<br>

- - -

<br>
這一段程式可以讓同學們在動態網頁上觀察：

+ 某一種(群)內容 (194 classes)，
+ 某一個(群)字詞，或
+ 某一個(群)字根
+ 某一種情緒

對於評論的：

+ useful
+ funny
+ cool

的效果。<br> <br>
透過互動式的圖形，
我們也可以觀察這些效果是否會隨商業類別(508 categories)發生變化。<br>

```{r set-options, echo=FALSE, cache=FALSE}
library(knitr)
options(width=90)
opts_chunk$set(comment = NA)
```

```{r results='asis', warning=F, message=F, cache=F}
Sys.setlocale('LC_ALL','C')
library(magrittr)
library(highcharter)
library(slam)
library(tm)
library(SnowballC)
library(RColorBrewer)          
# set color palette
pals = c(brewer.pal(8,"Set2")[c(6)],
         brewer.pal(8,"Dark2"),
         brewer.pal(8,"Set1")[c(1)])
# load data
load('data/yelp1.rdata')
load('data/average.rdata')
load('data/empath.rdata')
load('data/dtm0.rdata')
```
<br>

## (1) Preparation
To aviod glitches, we only analyze the business categories with more than 500 reviews. <br> First we do a quick hi-clustering on these (123) categories by their normolized class scores.  
```{r}
# clusters of categories by classes scores
d = dist(scale(wx[CA$nrev>500,]))
hc = hclust(d,"ward.D2")
```

Cut them into 10 groups.  
```{r}
kg = cutree(hc,10)
kg = order(order(-table(kg)))[kg]
table(kg)
```

Name the groups and combine the group_id with the category profiles. 
```{r}
gnames = c('Restaurant','Bar', 'Shop', 'Venue', 'Service', 
           'Art', 'Beauty', 'Fitness', 'Health', 'Public')
cats = cbind( cat=rownames(CA)[CA$nrev>500], 
              grp=factor(kg,labels=gnames), 
              CA[CA$nrev>500,] )
head(cats)
```
<br>

## (2) Helper Function
Make a helper function for interactive charting ...
```{r}
# helper function
make.chart = function(bygroup=F) { # x,X,y,Y,mxBCm=,review
  # overall usage and lift
  N = as.numeric(length(X))
  txt = sprintf("OVERALL: Usage=%.3f, Conf=%.3f, Base=%.3f, Lift=%.3f", 
                sum(X)/N, sum(X&Y)/sum(X), sum(Y)/N,
                N*sum(Y&X)/sum(X)/sum(Y))

  if(bygroup) {
    mx = sapply(levels(cats$grp), function(g) 
      rowSums(mxBC[, as.character(cats$cat[cats$grp == g])]) > 0 )
    bubbles = list(maxSize="20%",minSize=20)
    df2 = aggregate(.~grp,cats[,2:4],sum) }
  else {
    mx = mxBC[,CA$nrev>500]
    bubbles = list(maxSize="10%",minSize=10)
    df2 = cats[,1:4] }

  # cases
  df = mx %>% apply(2,function(v) {
      i = review$bid %in% rownames(mxBC)[v]
      n = as.numeric(sum(i))
      c( usage = sum(X[i])/n,
         base = round(sum(Y[i])/n, 3),
         conf = round(sum(X[i]&Y[i])/sum(X[i]), 3),
         lift = n * sum(Y[i]&X[i]) / sum(X[i]) / sum(Y[i]) )
    }) %>% t %>% data.frame %>% cbind(df2)
  
  # bubble chart
  tips=paste0(ifelse(bygroup, "", "<b>{point.cat}</b><br>"),
              "no.rev/biz: {point.nrev} / {point.nbiz}<br>",
              "conf/base: {point.conf} / {point.base}")
  hchart(df,"scatter",hcaes(x=usage,y=lift,size=nrev,group=grp)) %>%
    hc_title(text=sprintf("The effect of <b>%s</b> on <b>%s</b>",x,y)) %>% 
    hc_subtitle(text = txt) %>% hc_colors(pals) %>% 
    hc_tooltip(hideDelay=100,useHTML=T,pointFormat=tips) %>% 
    hc_plotOptions(bubble = bubbles) %>% hc_size(height=640) %>% 
    hc_chart(zoomType="xy") %>% hc_add_theme(hc_theme_flat())
}
```
<br>

## (3) Explore the Effect of Content Classes

### 3.1 某一種內容
```{r results='asis', fig.align='center'}
y = "funny";  Y = review[, y] %>% {. > median(.)}
x = "swearing_terms"; X = scores[, x] %>% {. > median(.)}
make.chart()
```
<br>
For the entire corpus and each business categories, we calculate and display:

+ `Usage` = $P[X]$ : the base probability of $X$
+ `Conf` = $P[Y|X]$ : the probability of $Y$ given $X$  
+ `Base` = $P[Y]$ : the base probability of $Y$ 
+ `Lift` = $\frac{P[Y|X]}{P[Y]}$ : the lift of $X$ on $P[Y]$
<br>
<br>

### 3.2 某一群內容
可以一次看一群內容，也可以將整群商業類別集合起來一起看 ...
```{r results='asis', fig.align='center'}
y = "funny"; Y = review[, y] %>% {. > median(.)}
x = "sexual+lust"
X = rowSums(sapply(c('sexual','lust'), # <-- 將內容放在括弧裡面
                   function(s) scores[,s] > median(scores[,s]))) > 0 
make.chart(bygroup=TRUE)
```
<br>
<br>

### 3.3 某一群(個)字根
```{r results='asis', fig.align='center'}
y = "cool"; Y = review[, y] %>% {. > median(.)}
# 將字根放在括弧裡面
terms = grep("^(authen|genuin|pure|innoc|origin|true|truth)",
             dtm$dimnames$Terms[1:10000], value=F)
x = "authenticity"; X = as.integer(row_sums(dtm[,terms]) > 0)
make.chart()
```
<br>
<br>

### 3.4 某一群(個)字
To analyze the effect of words, we need a new document term matrix (`dtm2`).
```{r}
corp = Corpus(VectorSource(review$text))
corp = tm_map(corp,  content_transformer(tolower))
corp = tm_map(corp, removePunctuation)
dtm2 = DocumentTermMatrix(corp); dtm2
dtm2 = removeSparseTerms(dtm2, .999); dtm2
```


```{r results='asis', fig.align='center'}
y = "useful"; Y = review[, y] %>% {. > median(.)}
w = which(colnames(dtm2) %in% c('but','however','nonetheless'))
x = "but..."; X = as.integer(row_sums(dtm2[, w]) > 0)
make.chart(bygroup=TRUE)
```
<br>
<br>


### 3.5 標點符號
```{r results='asis', fig.align='center'}
y = "useful"; Y = review[, y] %>% {. > median(.)}
x = "?"; X = grepl("?",review$text,fixed=T)
make.chart(bygroup=TRUE)
```
<br>
<br>

More than one Punctuation ...
```{r}
x = "? ..."; X = grepl("\\?|!",review$text)
make.chart()
```
<br>
<br>

### 3.6 情緒
```{r results='asis', fig.align='center'}
y = "useful"; Y = review[, y] %>% {. > median(.)}
x = "anger"; X = senti[, x] %>% {. > median(.)}
make.chart()
```

```{r results='asis', fig.align='center'}
x = "positive"; X = senti[, x] %>% {. > median(.)}
make.chart()
```

```{r results='asis', fig.align='center'}
x = "negative"; X = senti[, x] %>% {. > median(.)}
make.chart()
```

