https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews
アパレルECサイトの商品レビューデータから、どのような投稿をすればより多くの“Positive Feedback”を獲得できるのかを分析する。
“Positive Feedback”とは、投稿したレビューが他のユーザーから「有益である」と判定された時に獲得できる。

library

library(dplyr)
library(tidyverse)
library(tidytext)
library(stringr)
library(ggplot2)
library(tidyr)
library(scales)
library(wordcloud)
library(reshape2)
library(igraph)
library(ggraph)
library(widyr)
library(purrr)
library(psych)
library(MASS)
library(SnowballC)
library(glmmML)
library(topicmodels)
library(ldatuning)
library(rpart)
library(partykit)
library(randomForest)
library(wordcloud)
library(wordcloud2)
library(ggrepel)
library(car)
library(broom)
library(pscl)
library(mpath)
library(webshot)
library(htmlwidgets)
library(DT)
library(htmltools)
library(gridExtra)
library(knitr)
library(devtools)
library(makedummies)
webshot::install_phantomjs()

loading

a_row <- read.csv("Womens Clothing E-Commerce Reviews.csv", 
                  stringsAsFactors = FALSE, sep = ",")

b_data <- a_row %>% 
  rename(no = X,
         item_id = Clothing.ID,
         age = Age,
         title = Title,
         review = Review.Text,
         rate = Rating,
         recommend = Recommended.IND,
         feedback = Positive.Feedback.Count,
         division = Division.Name,
         department = Department.Name,
         class = Class.Name)

b_data$no <- as.factor(b_data$no)
b_data$item_id <- as.factor(b_data$item_id)

overall

b_data %>% 
  head(2) %>% 
  DT::datatable()

str(b_data)

## 'data.frame':    23486 obs. of  11 variables:
##  $ no        : Factor w/ 23486 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ item_id   : Factor w/ 1206 levels "0","1","2","3",..: 768 1081 1078 1050 848 1081 859 859 1078 1078 ...
##  $ age       : int  33 34 60 50 47 49 39 39 24 34 ...
##  $ title     : chr  "" "" "Some major design flaws" "My favorite buy!" ...
##  $ review    : chr  "Absolutely wonderful - silky and sexy and comfortable" "Love this dress!  it's sooo pretty.  i happened to find it in a store, and i'm glad i did bc i never would have"| __truncated__ "I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small "| __truncated__ "I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments!" ...
##  $ rate      : int  4 5 3 5 5 2 5 4 5 5 ...
##  $ recommend : int  1 1 0 1 1 0 1 1 1 1 ...
##  $ feedback  : int  0 4 0 0 6 4 1 4 0 0 ...
##  $ division  : chr  "Initmates" "General" "General" "General Petite" ...
##  $ department: chr  "Intimate" "Dresses" "Dresses" "Bottoms" ...
##  $ class     : chr  "Intimates" "Dresses" "Dresses" "Pants" ...

summary(b_data %>% dplyr::select(-c(no, item_id)))

##       age          title              review               rate      
##  Min.   :18.0   Length:23486       Length:23486       Min.   :1.000  
##  1st Qu.:34.0   Class :character   Class :character   1st Qu.:4.000  
##  Median :41.0   Mode  :character   Mode  :character   Median :5.000  
##  Mean   :43.2                                         Mean   :4.196  
##  3rd Qu.:52.0                                         3rd Qu.:5.000  
##  Max.   :99.0                                         Max.   :5.000  
##    recommend         feedback         division          department       
##  Min.   :0.0000   Min.   :  0.000   Length:23486       Length:23486      
##  1st Qu.:1.0000   1st Qu.:  0.000   Class :character   Class :character  
##  Median :1.0000   Median :  1.000   Mode  :character   Mode  :character  
##  Mean   :0.8224   Mean   :  2.536                                        
##  3rd Qu.:1.0000   3rd Qu.:  3.000                                        
##  Max.   :1.0000   Max.   :122.000                                        
##     class          
##  Length:23486      
##  Class :character  
##  Mode  :character  
##                    
##                    
##

item_idの分布を確認する。

b_data %>% 
  count(item_id) %>% 
  ggplot(aes(x = n)) +
  geom_histogram()

1つしかないものが多い。
100以上のもののみ抽出し、説明変数に入れても良いが、今回はこの後item_idそのものを除外する。

最も多く出現したitem_idを抽出。

b_data %>% 
  semi_join(b_data %>% 
              count(item_id) %>% 
              top_n(1, n), by = "item_id") %>%
  summarise(item_id = first(item_id),
            rate_avg = mean(rate),
            recommend_avg = mean(recommend),
            feedback_avg = mean(feedback))

processing

unique check

ひとつのitem_idに対して複数のdivision,department,classが振られていないか確認する。
最初にカテゴリーの内容確認。

b_data %>% 
  dplyr::select(class) %>% 
  unique() %>% 
  filter(class != "") %>% 
  mutate(row_num = row_number()) %>% 
  left_join(b_data %>% 
              dplyr::select(department) %>% 
              unique() %>% 
              filter(department != "") %>% 
              mutate(row_num = row_number()),
            by = "row_num") %>% 
  left_join(b_data %>% 
              dplyr::select(division) %>% 
              unique() %>% 
              filter(division != "") %>% 
              mutate(row_num = row_number()),
            by = "row_num") %>% 
  dplyr::select(-row_num) %>% 
    arrange(division, department, class) %>% 
  mutate_all(funs(replace(., is.na(.), ""))) %>% 
  dplyr::select(division, department, class) %>% 
  datatable(class = "cell-border stripe", rownames = FALSE, options = list(pageLength = 20))

n_distinct(b_data$item_id)

## [1] 1206

n_distinct(b_data$item_id, b_data$division)

## [1] 1402

n_distinct(b_data$item_id, b_data$department)

## [1] 1206

n_distinct(b_data$item_id, b_data$class)

## [1] 1207

division,classを加えるとユニークにならない。

classがユニークになっていないitem_idを確認する。

b_data %>% 
  count(item_id, class) %>% 
  count(item_id) %>% 
  filter(nn > 1) %>% 
  head() %>% 
  data.frame()

b_data %>% 
  filter(item_id == 1119) %>%
  dplyr::select(item_id, division, department, class) %>% 
  head()

原因は不明。ユニークになっているdepartmentのみを使用し、divisionとclassは削除。

b_data <- b_data %>% 
  dplyr::select(-division, -class)

missing values

NAの数を確認。

b_data %>% 
  sapply(is.na) %>% 
  colSums()

##         no    item_id        age      title     review       rate 
##          0          0          0          0          0          0 
##  recommend   feedback department 
##          0          0          0

空欄の数を確認。

b_data %>% 
  sapply(function(x){
    x == ""
  }) %>% 
  colSums()

##         no    item_id        age      title     review       rate 
##          0          0          0       3810        845          0 
##  recommend   feedback department 
##          0          0         14

titleが空欄の影響。

b_data %>% 
  group_by(title == "") %>% 
  summarise(mean_fb = mean(feedback),
            median_fb = median(feedback))

b_data %>%
  mutate(title_blank = ifelse(title == "", "blank", "filled")) %>%
  group_by(title_blank) %>% 
  mutate(title_blank_n = n()) %>% 
  ungroup() %>% 
  count(feedback, title_blank_n, title_blank) %>% 
  mutate(ratio = n / title_blank_n) %>% 
  ggplot(aes(x = feedback, y = ratio, fill = title_blank)) +
  geom_col(position = "dodge", alpha = 0.7, width = 1) +
  xlim(-1, 20)

reviewの空欄。

b_data %>%
  mutate(review_blank = ifelse(review == "", "blank", "filled")) %>%
  group_by(review_blank) %>% 
  mutate(review_blank_n = n()) %>% 
  ungroup() %>% 
  count(feedback, review_blank_n, review_blank) %>% 
  mutate(ratio = n / review_blank_n) %>% 
  dplyr::select(feedback, ratio, review_blank) %>% 
  ggplot(aes(x = feedback, y = ratio, fill = review_blank)) +
  geom_col(position = "dodge", alpha = 0.7, width = 1) +
  xlim(-1, 20)

reviewが空欄の場合、feedbackが付かないので除外する。

b_data %>% 
  filter(review == "") %>% 
  count(feedback)

b_data <- b_data %>% 
  filter(review != "")

b_data %>% 
  sapply(function(b_data){
    b_data == ""
  }) %>% 
  colSums()

##         no    item_id        age      title     review       rate 
##          0          0          0       2966          0          0 
##  recommend   feedback department 
##          0          0         13

departmentの空欄について確認する。

b_data %>% 
  filter(department == "")

item_idでdepartmentを特定できないか検証。

b_data %>% 
  semi_join(b_data %>% 
              filter(department == ""),
            by = "item_id") %>% 
  dplyr::select(item_id, department) %>% 
  filter(department != "")

departmentが空欄のitem_idは、全てdepartmentが空欄であるので、特定はできない。

departmentの空欄は“blank”を代入。

b_data$department[b_data$department == ""] <- "blank"

b_data %>% 
  sapply(function(b_data){
    b_data == ""
  }) %>% 
  colSums()

##         no    item_id        age      title     review       rate 
##          0          0          0       2966          0          0 
##  recommend   feedback department 
##          0          0          0

b_data %>% 
  str()

## 'data.frame':    22641 obs. of  9 variables:
##  $ no        : Factor w/ 23486 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ item_id   : Factor w/ 1206 levels "0","1","2","3",..: 768 1081 1078 1050 848 1081 859 859 1078 1078 ...
##  $ age       : int  33 34 60 50 47 49 39 39 24 34 ...
##  $ title     : chr  "" "" "Some major design flaws" "My favorite buy!" ...
##  $ review    : chr  "Absolutely wonderful - silky and sexy and comfortable" "Love this dress!  it's sooo pretty.  i happened to find it in a store, and i'm glad i did bc i never would have"| __truncated__ "I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small "| __truncated__ "I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments!" ...
##  $ rate      : int  4 5 3 5 5 2 5 4 5 5 ...
##  $ recommend : int  1 1 0 1 1 0 1 1 1 1 ...
##  $ feedback  : int  0 4 0 0 6 4 1 4 0 0 ...
##  $ department: chr  "Intimate" "Dresses" "Dresses" "Bottoms" ...

correlation

cor(b_data[, which(sapply(b_data, is.integer))],
    use = "pairwise.complete.obs")

##                  age        rate   recommend    feedback
## age       1.00000000  0.02996163  0.03420786  0.04085044
## rate      0.02996163  1.00000000  0.79256807 -0.06098444
## recommend 0.03420786  0.79256807  1.00000000 -0.06592336
## feedback  0.04085044 -0.06098444 -0.06592336  1.00000000

pairs.panels(b_data[, which(sapply(b_data, is.integer))],
             lm = TRUE, ellipses = FALSE, stars = TRUE, density = FALSE)

多重共線性を考慮するとrecommendを除外するべきか？
feedbackとの間に線形関係は見られない。
線形回帰は難しいか。

target variable

b_data %>% 
  count(feedback) %>% 
  ggplot(aes(x = feedback, y = n)) +
  geom_col(width = 1)

feedback40以下に絞る。

b_data %>% 
  count(feedback) %>% 
  ggplot(aes(x = feedback, y = n)) +
  geom_col(width = 1) +
  xlim(-1, 40)

describe(b_data$feedback)

非負
離散値
上限無し
平均 < 分散
→負の二項分布？

tokenisation

c_token <- b_data %>% 
  unnest_tokens(word, review)

c_token %>% 
  filter(no == 0) %>% 
  dplyr::select(word) %>% 
  mutate(row_num = row_number()) %>% 
    left_join(b_data %>% 
                filter(no == 0) %>% 
                dplyr::select(review) %>% 
                mutate(row_num = 1),
              by = "row_num") %>% 
  dplyr::select(review, word) %>% 
  dplyr:::rename(before_token = review,
                 after_token = word) %>% 
   datatable(class = "cell-border stripe", rownames = FALSE, options = list(pageLength = 20))

stemming

stemmingで変化した単語のうち数の多いもの。

c_token %>% 
  mutate(word_stem = wordStem(word, language = "en")) %>% 
  filter(word != word_stem) %>% 
  group_by(word_stem) %>% 
  mutate(word_stem_n = n()) %>% 
  ungroup() %>% 
  group_by(word, word_stem, word_stem_n) %>% 
  summarise(word_n = n()) %>%
  dplyr::select(word, word_n, word_stem, word_stem_n) %>% 
  arrange(desc(word_stem_n)) %>% 
  head(20) %>% 
  datatable(class = "cell-border stripe", rownames = FALSE, options = list(pageLength = 20))

stopwordsを除外した後のstemming上位。

c_token %>% 
  anti_join(stop_words) %>% 
  mutate(word_stem = wordStem(word, language = "en")) %>% 
  filter(word != word_stem) %>% 
  group_by(word_stem) %>% 
  mutate(word_stem_n = n()) %>% 
  ungroup() %>% 
  group_by(word, word_stem, word_stem_n) %>% 
  summarise(word_n = n()) %>%
  dplyr::select(word, word_n, word_stem, word_stem_n) %>% 
  arrange(desc(word_stem_n)) %>% 
  head(20) %>% 
  datatable(class = "cell-border stripe", rownames = FALSE, options = list(pageLength = 20))

c_token <- c_token %>% 
  mutate(word_stem = wordStem(word, language = "en")) %>% 
  rename(word_ori = word,
         word = word_stem)

sentiment

感情辞書の中身を確認。

bind_cols(get_sentiments("afinn") %>% head(10) %>% dplyr::rename(word_afinn = word,
                                                               score_afinn = score),
          get_sentiments("bing") %>% head(10) %>% dplyr::rename(word_bing = word,
                                                              sent_bing = sentiment),
          get_sentiments("nrc") %>% head(10) %>% dplyr::rename(word_nrc = word,
                                                             sent_nrc = sentiment)) %>% 
  DT::datatable(rownames = FALSE)

各辞書を今回のデータに紐付けた場合の上位。

bind_rows(c_token %>% 
            inner_join(get_sentiments("afinn"), by = "word") %>% 
            mutate(sentiment = case_when(score > 0 ~ "positive",
                                         score < 0 ~ "negative"),
                   method = "afinn") %>% 
            filter(!is.na(sentiment)),
          c_token %>% 
            inner_join(get_sentiments("bing"), by = "word") %>% 
            mutate(method = "bing"),
          c_token %>% 
            inner_join(get_sentiments("nrc"), by = "word") %>% 
            filter(sentiment %in% c("positive", "negative")) %>% 
            mutate(method = "nrc")
) %>% 
  count(word, sentiment, method) %>% 
  group_by(sentiment, method) %>% 
  top_n(5) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(sentiment ~ method, scales = "free") +
  coord_flip()

各辞書を今回のデータに紐付けた場合のposinegaの単語数比較。

bind_rows(c_token %>% 
            left_join(get_sentiments("afinn"), by = "word") %>% 
            group_by(no) %>% 
            summarise(score = sum(score, na.rm = TRUE)) %>% 
            mutate(method = "afinn"),
          c_token %>% 
            left_join(get_sentiments("bing"), by = "word") %>% 
            count(no, sentiment) %>%
            mutate(n = case_when(sentiment == "positive" ~ n,
                                 sentiment == "negative" ~ -n)) %>% 
            group_by(no) %>%
            summarise(score = sum(n, na.rm = TRUE)) %>% 
            mutate(method = "bing"),
          c_token %>% 
            left_join(get_sentiments("nrc") %>% 
                        filter(sentiment %in% c("positive", "negative")),
                      by = "word") %>% 
            count(no, sentiment) %>%
            mutate(n = case_when(sentiment == "positive" ~ n,
                                 sentiment == "negative" ~ -n)) %>% 
            group_by(no) %>%
            summarise(score = sum(n, na.rm = TRUE)) %>% 
            mutate(method = "nrc")
) %>% 
  mutate(score_colour = case_when(score < 0 ~ "a",
                                  score == 0 ~ "b",
                                  score > 0 ~ "c")) %>% 
  ggplot(aes(x = score, fill = method)) + # , fill = score_colour
  #  geom_histogram(binwidth = 1, show.legend = FALSE) +
  stat_density(bw = 1) +
  facet_wrap( ~ method, scales = "free") +
  geom_vline(xintercept = 0) +
  scale_y_continuous(labels = percent_format())

bind_rows(
get_sentiments("afinn") %>% 
  count() %>% 
  mutate(method = "afinn"),
get_sentiments("bing") %>% 
  count() %>% 
  mutate(method = "bing"),
get_sentiments("nrc") %>% 
  filter(sentiment %in% c("positive", "negative"))%>%
  count() %>% 
  mutate(method = "nrc")
) %>% 
  dplyr::select(method, n)

感情辞書afinnの中身を確認。

c_token %>% 
  inner_join(get_sentiments("afinn"), by = "word") %>% 
  count(word, score) %>% 
  group_by(score) %>% 
  top_n(5, n) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n, fill = score)) +
  geom_col(show.legend = FALSE) +
  facet_wrap( ~ score, scales = "free", ncol = 3) +
  coord_flip()

今回は感情の項目がposinegaのみと、単純なbingを使用する。

c_token <- c_token %>% 
  left_join(get_sentiments("bing"), by = "word")

bingを適用してみて、posinegaの数が多いものを確認。

c_token %>% 
  filter(!is.na(sentiment)) %>% 
  count(word, sentiment) %>% 
  group_by(sentiment) %>% 
  top_n(10, n) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ sentiment, scales = "free")

不適切な単語を除外する。

d_typical_sent <- b_data %>% 
  dplyr::select(no, review) %>% 
  unnest_tokens(review, review, token = "regex", pattern = "\\.") %>% 
  filter(str_detect(review, c("fall", "worn", "flare", "top", "bust", "tank", "hang", "dark")))

c_token <- c_token %>% 
  mutate(sentiment = if_else(word %in% c("fall", "worn", "flare", "top", "bust", "tank", "hang", "dark"), as.character(NA), sentiment))

除外後に再度感情の上位を確認。

c_token %>% 
  filter(!is.na(sentiment)) %>% 
  count(word, sentiment) %>% 
  group_by(sentiment) %>% 
  top_n(10, n) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ sentiment, scales = "free")

negation

否定語の影響を考慮する。

e_negation <- b_data %>%
  dplyr::select(no, review) %>% 
  unnest_tokens(bigrams, review, token = "ngrams", n = 2) %>% 
  separate(bigrams, c("word1", "word2"), sep = " ") %>% 
  mutate(word1 = wordStem(word1, language = "en"),
         word2 = wordStem(word2, language = "en")) %>% 
  filter((word1 %in% c("no", "not", "never", "without")) |
           str_detect(word1, "n't")) %>% 
  inner_join(get_sentiments("bing"), by = c("word2" = "word")) %>% 
  filter(!word2 %in% c("fall", "worn", "flare", "top", "bust", "tank", "hang", "dark"))

e_negation %>% 
  count(word1, sort = TRUE) %>% 
  head(10) %>% 
  DT::datatable(rownames = FALSE)

e_negation %>% 
  count(word2, sort = TRUE) %>% 
  head(10)

否定語の後に使われている単語ランキング。

e_negation %>% 
  count(word2, sentiment) %>% 
  top_n(20, n) %>% 
  mutate(word = reorder(word2, n)) %>% 
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~ sentiment, scales = "free")

否定語の直後に出現している単語の感情スコアの正負を逆転して再計算する。

f_negation_smr <- e_negation %>% 
  count(no, sentiment) %>% 
  spread(key = sentiment, value = n, fill = 0) %>% 
  dplyr::rename(positive_inv = positive,
                negative_inv = negative)

g_sent_smr <- c_token %>% 
  count(no, feedback, rate, sentiment) %>% 
  spread(key = sentiment, value = n, fill = 0) %>% 
  dplyr::rename(neutral = "<NA>") %>% 
  left_join(f_negation_smr, by = "no") %>% 
  #  mutate_all(funs(if_else(is.na(.), as.integer(0), as.integer(.)))) %>% 
  mutate_all(funs(replace(., is.na(.), 0))) %>% 
  mutate(posi_total = positive - positive_inv + negative_inv,
         nega_total = negative - negative_inv + positive_inv,
         sent_total = posi_total - nega_total)

否定語を考慮した後の感情スコア分布。

g_sent_smr %>% 
  mutate(sent_colour = case_when(
    sent_total < 0 ~ "a",
    sent_total == 0 ~ "b",
    sent_total > 0 ~ "c"
  )) %>% 
  ggplot(aes(x = sent_total, fill = sent_colour)) +
  geom_histogram(show.legend = FALSE, binwidth = 1)

最も多くpositive wordが使われているreview。

b_data %>% 
  inner_join(g_sent_smr %>% 
               top_n(1, sent_total) %>% 
               mutate(no = as.factor(no)),
             by = "no") %>% 
  dplyr::select(review, sent_total)

最もnegative。

b_data %>% 
  inner_join(g_sent_smr %>% 
               top_n(-1, sent_total) %>% 
               mutate(no = as.factor(no)),
             by = "no") %>% 
  dplyr::select(review, sent_total) %>% 
  DT::datatable(class = "cell-border stripe", rownames = FALSE, options = list(pageLength = 1))

どの単語がnegativeなのか分からないので、確認。

c_token %>% 
  inner_join(g_sent_smr %>% 
               top_n(1, desc(sent_total)),
             by = "no") %>% 
  filter(!is.na(sentiment)) %>% 
  dplyr::select(no, word, sentiment) %>% 
  DT::datatable(class = "cell-border stripe", rownames = FALSE, options = list(pageLength = 7))

embroidery:かぎ裂き
snag:かぎ裂きを作る

feedbackとsentimentの線形関係を確認。

g_sent_smr %>% 
  ggplot(aes(sent_total, feedback)) +
  geom_point() +
  #geom_abline(colour = "red")
  geom_smooth(method = "lm", colour = "red")

cor(g_sent_smr$feedback, g_sent_smr$sent_total)

## [1] -0.003327681

絶対値であればそう感が強く出るか、確認。

g_sent_smr %>% 
  ggplot(aes(abs(sent_total), feedback)) +
  geom_point() +
  #geom_abline(colour = "red")
  geom_smooth(method = "lm", colour = "red")

cor(g_sent_smr$feedback, abs(g_sent_smr$sent_total))

## [1] 0.01046473

絶対値の方が相関は強く出た。しかし、今回はモデルには使用していない。

rateとsent_totalの関係確認。

g_sent_smr %>% 
  ggplot(aes(as.factor(rate), sent_total)) +
  geom_boxplot() +
  geom_abline(colour = "red") +
  labs(x = "rate")

cor(g_sent_smr$rate, g_sent_smr$sent_total)

## [1] 0.4009777

h_data_for_analysis <- b_data %>% 
  left_join(g_sent_smr %>% 
              dplyr::select(no, posi_total, nega_total, sent_total),
            by = "no") %>% 
  dplyr::select(-c(item_id, review)) %>% 
  mutate(title = if_else(title == "", 0, 1),
         department = as.factor(department),
         recommend = as.factor(recommend)) %>% 
  dplyr::rename(title_filled = title) %>% 
  inner_join(c_token %>% 
               count(no) %>% 
               dplyr::rename(word_cnt = n),
             by = "no")

posinegaを比率に変換し、feedbackとの線形関係を確認。

h_data_for_analysis %>% 
  mutate(posi_ratio = posi_total / word_cnt,
         nega_ratio = nega_total / word_cnt) %>% 
  gather(key = sent, value = ratio, posi_ratio, nega_ratio) %>% 
  ggplot(aes(ratio, feedback, fill = sent)) +
  geom_point(show.legend = FALSE) +
  stat_smooth(aes(group = sent, colour = sent), method = "lm", se = FALSE, fullrange = TRUE, show.legend = FALSE) +
  facet_wrap( ~ sent, scales = "free")

topic modeling

i_dtm <- c_token %>% 
  filter(!str_detect(word, "\\d")) %>% 
  anti_join(stop_words, by = "word") %>% 
  mutate(no = as.factor(no)) %>% 
  count(no, word) %>%
  cast_dtm(no, word, n)

j_for_topic <- i_dtm[apply(i_dtm, 1, sum) > 0, ]

適切なトピック数を推定。

# system.time(k_topic_number <- FindTopicsNumber(
#   j_for_topic,
#   topics = seq(from = 2, to = 10, by = 1),
#   metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
#   method = "Gibbs",
#   control = list(seed = 2018),
#   mc.cores = 2L,
#   verbose = TRUE
# ))

user	system	elapsed
1.905	1.388	551.116

FindTopicsNumber_plot(k_topic_number)

Griffiths2004……周辺尤度
CaoJuan2009……cos類似度
Arun2010……距離?
Deveaud2014……KLダイバージェンス

解釈性も考慮し、トピック数は4で進める。

l_lda <- LDA(j_for_topic, k = 4, control = list(seed = 2018))

γ値によってトピックによる分類度合いを確認。
γ値とは文書の構成単語の何割がそのトピックから発生したかを示す（？）

m_gamma <- l_lda %>% 
  tidy(matrix = "gamma") %>% 
  mutate(document = factor(document, levels = unique(c_token$no)))

m_gamma %>% 
  ggplot(aes(gamma)) +
  geom_histogram() +
  scale_y_log10() +
  labs(x = expression(gamma))

あまり分類されていない。

m_gamma %>% 
  ggplot(aes(gamma, fill = factor(topic))) +
  geom_histogram() +
  facet_wrap( ~ topic) +
  scale_y_log10() +
  labs(x = expression(gamma))

どの基準で別れたのか決定木で確認。

n_for_tree <- h_data_for_analysis %>% 
  left_join(m_gamma %>% 
              group_by(document) %>% 
              top_n(1, gamma) %>% 
              ungroup(),
            by = c("no" = "document")) %>% 
  dplyr::select(-gamma, -no) %>% 
  mutate(topic = factor(topic))

o_tree_res <- rpart(topic ~ ., data = n_for_tree,
                    method = "class", cp = 0.005)
plot(as.party(o_tree_res))

departmentによる分岐が強い？
department別にγ値を確認。

m_gamma %>% 
  left_join(h_data_for_analysis, by = c("document" = "no")) %>% 
  mutate(department = reorder(department, gamma * topic)) %>% 
  ggplot(aes(factor(topic), gamma)) +
  geom_boxplot() +
  facet_wrap( ~ department, nrow = 2) +
  labs(x = "topic number", y = expression(gamma))

あまり分かれていない。

p_beta <- l_lda %>% 
  tidy(matrix = "beta")

各トピックのβ値上位20単語。
β値とは単語分布のパラメータ（？）
トピックの解釈を試みる。

p_beta %>% 
  anti_join(stop_words, by = c("term" = "word")) %>% 
  group_by(topic) %>% 
  top_n(15, beta) %>% 
  ungroup() %>% 
  group_by(topic, term) %>% 
  arrange(desc(beta)) %>% 
  ungroup() %>% 
  mutate(term = reorder(str_c(term, topic, sep = "__"), beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap( ~ topic, scales = "free") +
  coord_flip() +
  scale_x_discrete(labels = function(x){
    gsub("__.+$", "", x)
  }) +
  labs(x = NULL, y = expression(beta))

stop_wordsを除外した上で、各トピックβ値上位20単語のβ値を比較し、トピックの解釈を行う。

p_beta %>% 
  anti_join(stop_words, by = c("term" = "word")) %>% 
  semi_join(p_beta %>% 
              anti_join(stop_words, by = c("term" = "word")) %>% 
              group_by(topic) %>% 
              top_n(10, beta) %>% 
              ungroup(), by = "term") %>% 
  group_by(topic) %>% 
  mutate(term = reorder(term, desc(beta))) %>% 
  ggplot(aes(x = factor(topic), y = beta)) +
  geom_bar(stat = "identity", show.legend = FALSE, aes(fill = topic)) +
  facet_wrap( ~ term, nrow = 2) +
  labs(x = "topic", y = expression(beta))

解釈の難しい1,4のみ抽出して比較。

p_beta %>% 
  anti_join(stop_words, by = c("term" = "word")) %>% 
  semi_join(p_beta %>% 
              anti_join(stop_words, by = c("term" = "word")) %>% 
              group_by(topic) %>% 
              top_n(10, beta) %>% 
              ungroup(), by = "term") %>% 
  group_by(topic) %>% 
  mutate(term = reorder(term, desc(beta))) %>% 
  filter(topic %in% c("1", "4")) %>% 
  ggplot(aes(x = factor(topic), y = beta)) +
  geom_bar(stat = "identity", show.legend = FALSE, aes(fill = topic)) +
  facet_wrap( ~ term, nrow = 2) +
  labs(x = "topic", y = expression(beta))

topic1–trialの値が高い。
topic2–top,skirt,sweaterとnegativeな単語の値が高い。
topic3–onlineとsizeの値が高い。
topic4–dressとpositiveな単語の値が高い。

以下、解釈がずれていないか確認を行う。

q_topic_data <- b_data %>% 
  inner_join(m_gamma %>% 
               group_by(document) %>% 
               top_n(1, gamma) %>% 
               ungroup(), by = c("no" = "document")) %>% 
  mutate(topic = topic %>% factor()) %>% 
  filter(!is.na(topic))

トピックごとのデータ概観。

q_topic_data %>% 
  group_by(topic) %>% 
  summarise(age_avg = mean(age) %>% round(2),
            rate_avg = mean(rate) %>% round(2),
            feedback_avg = mean(feedback) %>% round(2)) %>% 
  DT::datatable(rownames = FALSE)

love, veri, perfectの単語のβ値が高かったtopic1,4はrateが高い？

q_topic_data %>% 
  group_by(topic) %>% 
  summarise(rate_avg = mean(rate, na.rm = TRUE),
            rate_std = sd(rate, na.rm = TRUE))

若干だが平均が高く出ている。

dressのβ値が高いtopic4はdepartmentがdressの割合が高いか？

q_topic_data %>% 
  count(topic, department) %>% 
  group_by(topic) %>% 
  mutate(ratio = n / sum(n),
         department = reorder(department, ratio)) %>% 
  ungroup() %>% 
  ggplot(aes(x = topic, y = ratio, fill = department)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_y_continuous(labels = percent) +
  geom_text_repel(aes(label = percent(ratio)), position = position_stack(vjust = 0.5), point.padding = 0, segment.size = 0)

高く出ている。

sizeの値が高いtopic3は、数字が多いか？

c_token %>% 
  inner_join(m_gamma %>% 
               group_by(document) %>% 
               top_n(1, gamma) %>% 
               ungroup(), by = c("no" = "document")) %>% 
  mutate(digit_bool = str_detect(word, "\\d")) %>%
  group_by(topic) %>%
  mutate(word_cnt = n()) %>%
  ungroup() %>%
  group_by(topic) %>% 
  summarise(digit_cnt = sum(digit_bool, na.rm = TRUE),
            digit_ratio = sum(digit_bool, na.rm = TRUE) / first(word_cnt)) %>% 
  DT::datatable(rownames = FALSE)

若干高い。topic4も高く出ている。

tf_idfによるトピック解釈.

c_token %>% 
  filter(!str_detect(word, "\\d")) %>% 
  left_join(m_gamma %>% 
              group_by(document) %>% 
              top_n(1, gamma) %>% 
              ungroup(), by = c("no" = "document")) %>%
  count(topic, word) %>% 
  filter(n >= 30) %>% 
  bind_tf_idf(word, topic, n) %>% 
  group_by(topic) %>% 
  top_n(10, tf_idf) %>% 
  ungroup() %>% 
  mutate(word = reorder(str_c(word, topic, sep = "__"), tf_idf)) %>% 
  ggplot(aes(word, tf_idf, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap( ~ topic, scales = "free") +
  coord_flip() +
  scale_x_discrete(labels = function(x){
    gsub("__.+$", "", x)
  })

tf_idfによるは解釈が困難。

network

トピック別に単語ネットワークを作成し解釈を試みる。

for(i in 1:4){
  set.seed(2018)
  xxx <- c_token %>% 
    anti_join(stop_words, by = "word") %>% 
    filter(!str_detect(word, "\\d")) %>% 
    left_join(m_gamma %>% 
                group_by(document) %>% 
                top_n(1, gamma) %>% 
                ungroup(), by = c("no" = "document")) %>% 
    filter(topic == i) %>% 
    count(no, word) %>% 
    filter(n >= 5) %>% 
    ungroup() %>% 
    pairwise_cor(word, no, sort = TRUE) %>% 
    filter(!near(correlation, 1)) %>% 
    graph_from_data_frame() %>% 
    ggraph(layout = "fr") +
    geom_edge_link(aes(alpha = correlation, width = correlation), colour = "lightgray") +
    scale_edge_width(range = c(0.1, 1)) +
    geom_node_point(size = 3, colour = "darkblue") +
    geom_node_text(aes(label = name), repel = TRUE) +
    labs(tite = str_c("topic_", i, sep = ""))
  
  
  plot(xxx)
}

wordcloud

topic_1

set.seed(2018)
my_cloud1 <- c_token %>% 
  anti_join(stop_words, by = "word") %>% 
  filter(!str_detect(word, "\\d"),
         !str_detect(word,"_")) %>% 
  left_join(m_gamma %>% 
              group_by(document) %>% 
              top_n(1, gamma) %>% 
              ungroup(), by = c("no" = "document")
  ) %>% 
  filter(topic == 1) %>% 
  count(word) %>% 
  filter(n >= 50) %>% 
  wordcloud2(size = 10, gridSize = 5)

saveWidget(my_cloud1, file = "tmp1.html", selfcontained = FALSE)
  img1 <- webshot::webshot("tmp1.html", file = "wc1.png", delay = 5, vwidth = 1000, vheight = 700)

tags$a(href = "tmp1.html",
       tags$img(src = img1, alt = ""))

topic_2

topic_3

topic_4

modeling

モデリング用のデータを作成。
posinegaのratioなど。

h_data_for_analysis <- h_data_for_analysis %>%
  left_join(m_gamma %>%
              mutate(topic = str_c("topic_", topic)) %>%
              spread(key = topic, value = gamma, fill = 0),
            by = c("no" = "document"))

t_data_for_modeling <- h_data_for_analysis %>% 
  mutate(posi_ratio = posi_total / word_cnt,
         nega_ratio = nega_total / word_cnt) %>% 
  dplyr::select(-c("no", "posi_total", "nega_total", "sent_total", "topic_4")) %>% 
  mutate(department = department %>% as.factor()) %>% 
  makedummies() %>% 
  filter(complete.cases(.) == TRUE)

平均と標準偏差を確認。

u_mean <- mean(t_data_for_modeling$feedback, na.rm = TRUE)
v_var <- var(t_data_for_modeling$feedback, na.rm = TRUE)
cat("mean", u_mean, "\nvar", v_var)

## mean 2.630654 
## var 33.48105

得られた数値を元に確率分布を描画。

t_data_for_modeling %>% 
  dplyr::select(feedback) %>% 
  mutate(data = "actual") %>%
  dplyr::rename(cnt = feedback) %>% 
  bind_rows(data_frame(cnt = rnbinom(n = 100000, size = 0.466, mu = u_mean)) %>% 
              mutate(data = "nbinom"),
            data_frame(cnt = rnorm(n = 100000, mean = u_mean, sd = sqrt(v_var))) %>%
              mutate(data = "norm")) %>%
  ggplot(aes(cnt, colour = data, linetype = data)) +
  geom_line(stat = "density", adjust = 2) +
  scale_linetype_manual(values = c("solid", "dashed", "longdash"))

linear

分布の確認。

t_data_for_modeling %>% 
  dplyr::select(feedback) %>% 
  mutate(data = "actual") %>% 
  dplyr::rename(cnt = feedback) %>% 
  bind_rows(data_frame(cnt = rnorm(100000, mean = u_mean, sd = sqrt(v_var))) %>% 
              mutate(data = "norm")) %>% 
  ggplot(aes(cnt, colour = data, linetype = data)) +
  geom_line(stat = "density", adjust = 2) +
  scale_linetype_manual(values = c("solid", "dashed"))

線形関係の確認。

cor(t_data_for_modeling %>% 
      dplyr::select(-feedback, feedback)) %>% 
  cor.plot(xlas = 2)

cor(t_data_for_modeling %>% 
      dplyr::select(-feedback, feedback)) %>% 
  round(1) %>% 
  DT::datatable(options = list(pageLength = 17))

多重共線性が疑われるrecommendを除外。

t_data_for_modeling <- t_data_for_modeling %>% 
  dplyr::select(-recommend)

w_lm <- lm(feedback ~ ., data = t_data_for_modeling)
summary(w_lm)

## 
## Call:
## lm(formula = feedback ~ ., data = t_data_for_modeling)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -5.826  -2.497  -1.321   0.316 117.667 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -2.462875   8.553998  -0.288 0.773410    
## age                  0.019248   0.003067   6.275 3.55e-10 ***
## title_filled        -0.398685   0.113297  -3.519 0.000434 ***
## rate                -0.239593   0.037512  -6.387 1.72e-10 ***
## department_Bottoms   0.278348   1.574529   0.177 0.859682    
## department_Dresses   1.087393   1.573744   0.691 0.489598    
## department_Intimate  0.290926   1.577643   0.184 0.853697    
## department_Jackets   0.829547   1.582142   0.524 0.600062    
## department_Tops      0.675787   1.572722   0.430 0.667423    
## department_Trend     1.032841   1.657329   0.623 0.533162    
## word_cnt             0.035001   0.001502  23.306  < 2e-16 ***
## topic_1             11.137219  13.013385   0.856 0.392102    
## topic_2              5.324439  12.505847   0.426 0.670290    
## topic_3             -4.430701  13.340933  -0.332 0.739807    
## posi_ratio          -2.708613   0.858376  -3.156 0.001604 ** 
## nega_ratio          -3.368905   2.446524  -1.377 0.168521    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.663 on 22624 degrees of freedom
## Multiple R-squared:  0.0427, Adjusted R-squared:  0.04207 
## F-statistic: 67.28 on 15 and 22624 DF,  p-value: < 2.2e-16

x_lm_step <- stepAIC(w_lm)

summary(x_lm_step)

## 
## Call:
## lm(formula = feedback ~ age + title_filled + rate + department_Dresses + 
##     department_Jackets + department_Tops + word_cnt + posi_ratio, 
##     data = t_data_for_modeling)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -5.678  -2.500  -1.322   0.310 117.778 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.730399   0.244129   2.992  0.00278 ** 
## age                 0.019155   0.003064   6.252 4.13e-10 ***
## title_filled       -0.395565   0.113263  -3.492  0.00048 ***
## rate               -0.225125   0.035601  -6.323 2.61e-10 ***
## department_Dresses  0.794062   0.105750   7.509 6.19e-14 ***
## department_Jackets  0.522331   0.194861   2.681  0.00736 ** 
## department_Tops     0.381168   0.095546   3.989 6.65e-05 ***
## word_cnt            0.035096   0.001495  23.471  < 2e-16 ***
## posi_ratio         -2.556394   0.850877  -3.004  0.00266 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.663 on 22631 degrees of freedom
## Multiple R-squared:  0.04245,    Adjusted R-squared:  0.04211 
## F-statistic: 125.4 on 8 and 22631 DF,  p-value: < 2.2e-16

car::vif(x_lm_step)

##                age       title_filled               rate 
##           1.006958           1.030972           1.113887 
## department_Dresses department_Jackets    department_Tops 
##           1.561154           1.133824           1.590773 
##           word_cnt         posi_ratio 
##           1.298906           1.383510

多重共線性は見られない。

回帰診断図。

par(mfrow = c(2, 2))
plot(x_lm_step)

par(mfrow = c(1, 1))

等分散性、独立性、正規性に問題？

negative binomial

t_data_for_modeling %>% 
  dplyr::select(feedback) %>% 
  rename(cnt = feedback) %>% 
  mutate(data = "actual") %>% 
  bind_rows(data_frame(cnt = rnbinom(100000, size = 0.466, mu = u_mean)) %>% 
              mutate(data = "nb")) %>% 
  ggplot(aes(cnt, colour = data, linetype = data)) +
  geom_line(stat = "density", adjust = 2) +
  scale_linetype_manual(values = c("solid", "dashed"))

z_nb <- glm.nb(feedback ~ . ,data = t_data_for_modeling)
summary(z_nb)

## 
## Call:
## glm.nb(formula = feedback ~ ., data = t_data_for_modeling, init.theta = 0.397347153, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5712  -1.1834  -0.5919   0.0616   5.6829  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.1263093  2.5934486  -0.434   0.6641    
## age                  0.0077052  0.0009325   8.263  < 2e-16 ***
## title_filled        -0.1660607  0.0346953  -4.786 1.70e-06 ***
## rate                -0.0887924  0.0113392  -7.831 4.86e-15 ***
## department_Bottoms   0.9104660  0.6089563   1.495   0.1349    
## department_Dresses   1.2275413  0.6087359   2.017   0.0437 *  
## department_Intimate  0.8818906  0.6097637   1.446   0.1481    
## department_Jackets   1.0988041  0.6107269   1.799   0.0720 .  
## department_Tops      1.0566013  0.6085018   1.736   0.0825 .  
## department_Trend     1.2244363  0.6280347   1.950   0.0512 .  
## word_cnt             0.0138863  0.0004593  30.236  < 2e-16 ***
## topic_1              3.4105958  3.8972303   0.875   0.3815    
## topic_2             -0.5400880  3.7509793  -0.144   0.8855    
## topic_3             -1.2775170  4.0004741  -0.319   0.7495    
## posi_ratio          -2.0547557  0.2816983  -7.294 3.01e-13 ***
## nega_ratio          -0.9477514  0.7546057  -1.256   0.2091    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3973) family taken to be 1)
## 
##     Null deviance: 23454  on 22639  degrees of freedom
## Residual deviance: 21650  on 22624  degrees of freedom
## AIC: 88169
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.39735 
##           Std. Err.:  0.00526 
## 
##  2 x log-likelihood:  -88135.38400

過分散検定。

odTest(z_nb)

## Likelihood ratio test of H0: Poisson, as restricted NB model:
## n.b., the distribution of the test-statistic under H0 is non-standard
## e.g., see help(odTest) for details/references
## 
## Critical value of test statistic at the alpha= 0.05 level: 2.7055 
## Chi-Square Test Statistic =  78260.8141 p-value = < 2.2e-16

有意と出たのでポアソン回帰より、負の二項回帰が適切。

za_nb_step <- stepAIC(z_nb)

summary(za_nb_step)

## 
## Call:
## glm.nb(formula = feedback ~ age + title_filled + rate + department_Bottoms + 
##     department_Dresses + department_Intimate + department_Jackets + 
##     department_Tops + department_Trend + word_cnt + topic_1 + 
##     posi_ratio, data = t_data_for_modeling, init.theta = 0.3972941982, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5668  -1.1834  -0.5914   0.0621   5.6956  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -1.7854655  0.9571858  -1.865   0.0621 .  
## age                  0.0076721  0.0009321   8.231  < 2e-16 ***
## title_filled        -0.1644837  0.0346929  -4.741 2.13e-06 ***
## rate                -0.0842510  0.0107621  -7.828 4.94e-15 ***
## department_Bottoms   0.9095515  0.6087028   1.494   0.1351    
## department_Dresses   1.2280775  0.6084274   2.018   0.0435 *  
## department_Intimate  0.8814041  0.6095127   1.446   0.1482    
## department_Jackets   1.0971688  0.6104589   1.797   0.0723 .  
## department_Tops      1.0558604  0.6082441   1.736   0.0826 .  
## department_Trend     1.2240462  0.6277768   1.950   0.0512 .  
## word_cnt             0.0139058  0.0004579  30.372  < 2e-16 ***
## topic_1              4.1020691  2.9337066   1.398   0.1620    
## posi_ratio          -2.0175157  0.2796614  -7.214 5.43e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.3973) family taken to be 1)
## 
##     Null deviance: 23451  on 22639  degrees of freedom
## Residual deviance: 21649  on 22627  degrees of freedom
## AIC: 88165
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.39729 
##           Std. Err.:  0.00526 
## 
##  2 x log-likelihood:  -88137.02500

car::vif(za_nb_step)

##                 age        title_filled                rate 
##            1.008240            1.031887            1.119594 
##  department_Bottoms  department_Dresses department_Intimate 
##          373.645470          566.983140          182.465315 
##  department_Jackets     department_Tops    department_Trend 
##          121.365576          692.331052           16.169450 
##            word_cnt             topic_1          posi_ratio 
##            1.285490            1.017052            1.376472

model comparison

mean absolute error

zb_mae_lm <- mean(abs(t_data_for_modeling$feedback - fitted.values(x_lm_step)))
zc_mae_nb <- mean(abs(t_data_for_modeling$feedback - fitted.values(za_nb_step)))
cat("MAE of lm is ", zb_mae_lm, "\nMAE of nb glm is ", zc_mae_nb)

## MAE of lm is  2.977924 
## MAE of nb glm is  2.974029

za_nb_step %>% 
  tidy() %>% 
  rename(estimate_nb = estimate,
         p.value_nb = p.value) %>%
  dplyr::select(term, estimate_nb, p.value_nb) %>% 
  mutate_if(is.double, funs(round(., digits = 2))) %>% 
  left_join(x_lm_step %>%
              tidy() %>%
              rename(estimate_lm = estimate,
                     p.value_lm = p.value) %>%
              dplyr::select(term, estimate_lm, p.value_lm) %>% 
              mutate_if(is.double, funs(round(., digits = 2))),
            by = "term") %>%
  dplyr::select(term, estimate_lm, p.value_lm, estimate_nb, p.value_nb) %>% 
  DT::datatable(options = list(pageLength = 13))

負の二項回帰の方がステップワイズ後に残った変数が多い。
titleは空欄の方が良い？
有意ではないがtopic_1（trial？）の話題を多く入れた方が良い。
reviewは単語数を多くして、感情を表現する単語は使わない方が良い。

title_filledについて確認。

t_data_for_modeling %>% 
  mutate(title_filled = title_filled %>% as.factor()) %>% 
  ggplot(aes(x = title_filled, y = feedback)) +
  geom_boxplot() +
  scale_y_continuous(limits = c(NA,10))

t_data_for_modeling %>% 
  group_by(title_filled) %>% 
  summarise(avg = mean(feedback),
            med = median(feedback)) %>% 
  DT::datatable(rownames = FALSE, options = list(pageLength = 2))

t_data_for_modeling %>% 
  group_by(title_filled) %>% 
  do(qtile = quantile(.$feedback, seq(0, 1, by = 0.1))) %>% 
  cbind(do.call(rbind, .$qtile)) %>% 
  dplyr::select(-qtile) %>% 
  DT::datatable(rownames = FALSE, options = list(pageLength = 2))

t_data_for_modeling %>% 
  mutate(title_filled = title_filled %>% as.factor()) %>% 
  ggplot(aes(x = title_filled, y = feedback, fill = title_filled)) +
  geom_violin() +
  scale_y_continuous(limits = c(NA, 10))

residuals

data.frame(actual = t_data_for_modeling$feedback,
       resi_lm = fitted.values(x_lm_step) - t_data_for_modeling$feedback, 
       resi_nb = fitted.values(za_nb_step) - t_data_for_modeling$feedback) %>% 
  gather(key = method, value = residual, resi_lm, resi_nb) %>% 
  ggplot(aes(x = actual, y = residual, colour = method)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0) +
  facet_grid( ~ method)

grid.arrange(
  data.frame(actual = t_data_for_modeling$feedback,
             predict = fitted.values(x_lm_step)) %>%
    round(0) %>%
    gather(key = method, value = feedback, actual, predict) %>%
    count(method, feedback) %>%
    ggplot(aes(x = feedback, y = n, fill = method)) +
    geom_col(width = 1, position = "dodge") +
    xlim(c(-5, 10)) +
    labs(title = "linear"),

  data.frame(actual = t_data_for_modeling$feedback,
             predict = fitted.values(za_nb_step)) %>%
    round(0) %>%
    gather(key = method, value = feedback, actual, predict) %>%
    count(method, feedback) %>%
    ggplot(aes(x = feedback, y = n, fill = method)) +
    geom_col(width = 1, position = "dodge") +
    xlim(c(-5, 10)) +
    labs(title = "negative binomial"),
  
  ncol = 2)

data.frame(actual = t_data_for_modeling$feedback,
       resi_lm = fitted.values(x_lm_step) - t_data_for_modeling$feedback, 
       resi_nb = fitted.values(za_nb_step) - t_data_for_modeling$feedback) %>% 
  gather(key = method, value = residual, resi_lm, resi_nb) %>% 
  ggplot(aes(x = residual, colour = method)) +
  geom_line(stat = "density") +
  geom_vline(xintercept = 0, alpha = 0.3)

data.frame(actual = t_data_for_modeling$feedback,
       resi_lm = fitted.values(x_lm_step) - t_data_for_modeling$feedback, 
       resi_nb = fitted.values(za_nb_step) - t_data_for_modeling$feedback) %>% 
  gather(key = method, value = residual, resi_lm, resi_nb) %>% 
  ggplot(aes(x = residual, colour = method)) +
  geom_line(stat = "density") +
  xlim(c(-5, 5)) +
  geom_vline(xintercept = 0, alpha = 0.3)

Text Analysis of Women’s E-Commerce Clothing Reviews

2018-11-12