https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews
アパレルECサイトの商品レビューデータから、どのような投稿をすればより多くの“Positive Feedback”を獲得できるのかを分析する。
“Positive Feedback”とは、投稿したレビューが他のユーザーから「有益である」と判定された時に獲得できる。
library(dplyr)
library(tidyverse)
library(tidytext)
library(stringr)
library(ggplot2)
library(tidyr)
library(scales)
library(wordcloud)
library(reshape2)
library(igraph)
library(ggraph)
library(widyr)
library(purrr)
library(psych)
library(MASS)
library(SnowballC)
library(glmmML)
library(topicmodels)
library(ldatuning)
library(rpart)
library(partykit)
library(randomForest)
library(wordcloud)
library(wordcloud2)
library(ggrepel)
library(car)
library(broom)
library(pscl)
library(mpath)
library(webshot)
library(htmlwidgets)
library(DT)
library(htmltools)
library(gridExtra)
library(knitr)
library(devtools)
library(makedummies)
webshot::install_phantomjs()
a_row <- read.csv("Womens Clothing E-Commerce Reviews.csv",
stringsAsFactors = FALSE, sep = ",")
b_data <- a_row %>%
rename(no = X,
item_id = Clothing.ID,
age = Age,
title = Title,
review = Review.Text,
rate = Rating,
recommend = Recommended.IND,
feedback = Positive.Feedback.Count,
division = Division.Name,
department = Department.Name,
class = Class.Name)
b_data$no <- as.factor(b_data$no)
b_data$item_id <- as.factor(b_data$item_id)
b_data %>%
head(2) %>%
DT::datatable()
str(b_data)
## 'data.frame': 23486 obs. of 11 variables:
## $ no : Factor w/ 23486 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ item_id : Factor w/ 1206 levels "0","1","2","3",..: 768 1081 1078 1050 848 1081 859 859 1078 1078 ...
## $ age : int 33 34 60 50 47 49 39 39 24 34 ...
## $ title : chr "" "" "Some major design flaws" "My favorite buy!" ...
## $ review : chr "Absolutely wonderful - silky and sexy and comfortable" "Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i never would have"| __truncated__ "I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small "| __truncated__ "I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments!" ...
## $ rate : int 4 5 3 5 5 2 5 4 5 5 ...
## $ recommend : int 1 1 0 1 1 0 1 1 1 1 ...
## $ feedback : int 0 4 0 0 6 4 1 4 0 0 ...
## $ division : chr "Initmates" "General" "General" "General Petite" ...
## $ department: chr "Intimate" "Dresses" "Dresses" "Bottoms" ...
## $ class : chr "Intimates" "Dresses" "Dresses" "Pants" ...
summary(b_data %>% dplyr::select(-c(no, item_id)))
## age title review rate
## Min. :18.0 Length:23486 Length:23486 Min. :1.000
## 1st Qu.:34.0 Class :character Class :character 1st Qu.:4.000
## Median :41.0 Mode :character Mode :character Median :5.000
## Mean :43.2 Mean :4.196
## 3rd Qu.:52.0 3rd Qu.:5.000
## Max. :99.0 Max. :5.000
## recommend feedback division department
## Min. :0.0000 Min. : 0.000 Length:23486 Length:23486
## 1st Qu.:1.0000 1st Qu.: 0.000 Class :character Class :character
## Median :1.0000 Median : 1.000 Mode :character Mode :character
## Mean :0.8224 Mean : 2.536
## 3rd Qu.:1.0000 3rd Qu.: 3.000
## Max. :1.0000 Max. :122.000
## class
## Length:23486
## Class :character
## Mode :character
##
##
##
item_idの分布を確認する。
b_data %>%
count(item_id) %>%
ggplot(aes(x = n)) +
geom_histogram()
1つしかないものが多い。
100以上のもののみ抽出し、説明変数に入れても良いが、今回はこの後item_idそのものを除外する。
最も多く出現したitem_idを抽出。
b_data %>%
semi_join(b_data %>%
count(item_id) %>%
top_n(1, n), by = "item_id") %>%
summarise(item_id = first(item_id),
rate_avg = mean(rate),
recommend_avg = mean(recommend),
feedback_avg = mean(feedback))
ひとつのitem_idに対して複数のdivision,department,classが振られていないか確認する。
最初にカテゴリーの内容確認。
b_data %>%
dplyr::select(class) %>%
unique() %>%
filter(class != "") %>%
mutate(row_num = row_number()) %>%
left_join(b_data %>%
dplyr::select(department) %>%
unique() %>%
filter(department != "") %>%
mutate(row_num = row_number()),
by = "row_num") %>%
left_join(b_data %>%
dplyr::select(division) %>%
unique() %>%
filter(division != "") %>%
mutate(row_num = row_number()),
by = "row_num") %>%
dplyr::select(-row_num) %>%
arrange(division, department, class) %>%
mutate_all(funs(replace(., is.na(.), ""))) %>%
dplyr::select(division, department, class) %>%
datatable(class = "cell-border stripe", rownames = FALSE, options = list(pageLength = 20))
n_distinct(b_data$item_id)
## [1] 1206
n_distinct(b_data$item_id, b_data$division)
## [1] 1402
n_distinct(b_data$item_id, b_data$department)
## [1] 1206
n_distinct(b_data$item_id, b_data$class)
## [1] 1207
division,classを加えるとユニークにならない。
classがユニークになっていないitem_idを確認する。
b_data %>%
count(item_id, class) %>%
count(item_id) %>%
filter(nn > 1) %>%
head() %>%
data.frame()
b_data %>%
filter(item_id == 1119) %>%
dplyr::select(item_id, division, department, class) %>%
head()
原因は不明。ユニークになっているdepartmentのみを使用し、divisionとclassは削除。
b_data <- b_data %>%
dplyr::select(-division, -class)
NAの数を確認。
b_data %>%
sapply(is.na) %>%
colSums()
## no item_id age title review rate
## 0 0 0 0 0 0
## recommend feedback department
## 0 0 0
空欄の数を確認。
b_data %>%
sapply(function(x){
x == ""
}) %>%
colSums()
## no item_id age title review rate
## 0 0 0 3810 845 0
## recommend feedback department
## 0 0 14
titleが空欄の影響。
b_data %>%
group_by(title == "") %>%
summarise(mean_fb = mean(feedback),
median_fb = median(feedback))
b_data %>%
mutate(title_blank = ifelse(title == "", "blank", "filled")) %>%
group_by(title_blank) %>%
mutate(title_blank_n = n()) %>%
ungroup() %>%
count(feedback, title_blank_n, title_blank) %>%
mutate(ratio = n / title_blank_n) %>%
ggplot(aes(x = feedback, y = ratio, fill = title_blank)) +
geom_col(position = "dodge", alpha = 0.7, width = 1) +
xlim(-1, 20)
reviewの空欄。
b_data %>%
mutate(review_blank = ifelse(review == "", "blank", "filled")) %>%
group_by(review_blank) %>%
mutate(review_blank_n = n()) %>%
ungroup() %>%
count(feedback, review_blank_n, review_blank) %>%
mutate(ratio = n / review_blank_n) %>%
dplyr::select(feedback, ratio, review_blank) %>%
ggplot(aes(x = feedback, y = ratio, fill = review_blank)) +
geom_col(position = "dodge", alpha = 0.7, width = 1) +
xlim(-1, 20)
reviewが空欄の場合、feedbackが付かないので除外する。
b_data %>%
filter(review == "") %>%
count(feedback)
b_data <- b_data %>%
filter(review != "")
b_data %>%
sapply(function(b_data){
b_data == ""
}) %>%
colSums()
## no item_id age title review rate
## 0 0 0 2966 0 0
## recommend feedback department
## 0 0 13
departmentの空欄について確認する。
b_data %>%
filter(department == "")
item_idでdepartmentを特定できないか検証。
b_data %>%
semi_join(b_data %>%
filter(department == ""),
by = "item_id") %>%
dplyr::select(item_id, department) %>%
filter(department != "")
departmentが空欄のitem_idは、全てdepartmentが空欄であるので、特定はできない。
departmentの空欄は“blank”を代入。
b_data$department[b_data$department == ""] <- "blank"
b_data %>%
sapply(function(b_data){
b_data == ""
}) %>%
colSums()
## no item_id age title review rate
## 0 0 0 2966 0 0
## recommend feedback department
## 0 0 0
b_data %>%
str()
## 'data.frame': 22641 obs. of 9 variables:
## $ no : Factor w/ 23486 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ item_id : Factor w/ 1206 levels "0","1","2","3",..: 768 1081 1078 1050 848 1081 859 859 1078 1078 ...
## $ age : int 33 34 60 50 47 49 39 39 24 34 ...
## $ title : chr "" "" "Some major design flaws" "My favorite buy!" ...
## $ review : chr "Absolutely wonderful - silky and sexy and comfortable" "Love this dress! it's sooo pretty. i happened to find it in a store, and i'm glad i did bc i never would have"| __truncated__ "I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small "| __truncated__ "I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments!" ...
## $ rate : int 4 5 3 5 5 2 5 4 5 5 ...
## $ recommend : int 1 1 0 1 1 0 1 1 1 1 ...
## $ feedback : int 0 4 0 0 6 4 1 4 0 0 ...
## $ department: chr "Intimate" "Dresses" "Dresses" "Bottoms" ...
cor(b_data[, which(sapply(b_data, is.integer))],
use = "pairwise.complete.obs")
## age rate recommend feedback
## age 1.00000000 0.02996163 0.03420786 0.04085044
## rate 0.02996163 1.00000000 0.79256807 -0.06098444
## recommend 0.03420786 0.79256807 1.00000000 -0.06592336
## feedback 0.04085044 -0.06098444 -0.06592336 1.00000000
pairs.panels(b_data[, which(sapply(b_data, is.integer))],
lm = TRUE, ellipses = FALSE, stars = TRUE, density = FALSE)
多重共線性を考慮するとrecommendを除外するべきか?
feedbackとの間に線形関係は見られない。
線形回帰は難しいか。
b_data %>%
count(feedback) %>%
ggplot(aes(x = feedback, y = n)) +
geom_col(width = 1)
feedback40以下に絞る。
b_data %>%
count(feedback) %>%
ggplot(aes(x = feedback, y = n)) +
geom_col(width = 1) +
xlim(-1, 40)
describe(b_data$feedback)
c_token <- b_data %>%
unnest_tokens(word, review)
c_token %>%
filter(no == 0) %>%
dplyr::select(word) %>%
mutate(row_num = row_number()) %>%
left_join(b_data %>%
filter(no == 0) %>%
dplyr::select(review) %>%
mutate(row_num = 1),
by = "row_num") %>%
dplyr::select(review, word) %>%
dplyr:::rename(before_token = review,
after_token = word) %>%
datatable(class = "cell-border stripe", rownames = FALSE, options = list(pageLength = 20))
stemmingで変化した単語のうち数の多いもの。
c_token %>%
mutate(word_stem = wordStem(word, language = "en")) %>%
filter(word != word_stem) %>%
group_by(word_stem) %>%
mutate(word_stem_n = n()) %>%
ungroup() %>%
group_by(word, word_stem, word_stem_n) %>%
summarise(word_n = n()) %>%
dplyr::select(word, word_n, word_stem, word_stem_n) %>%
arrange(desc(word_stem_n)) %>%
head(20) %>%
datatable(class = "cell-border stripe", rownames = FALSE, options = list(pageLength = 20))
stopwordsを除外した後のstemming上位。
c_token %>%
anti_join(stop_words) %>%
mutate(word_stem = wordStem(word, language = "en")) %>%
filter(word != word_stem) %>%
group_by(word_stem) %>%
mutate(word_stem_n = n()) %>%
ungroup() %>%
group_by(word, word_stem, word_stem_n) %>%
summarise(word_n = n()) %>%
dplyr::select(word, word_n, word_stem, word_stem_n) %>%
arrange(desc(word_stem_n)) %>%
head(20) %>%
datatable(class = "cell-border stripe", rownames = FALSE, options = list(pageLength = 20))
c_token <- c_token %>%
mutate(word_stem = wordStem(word, language = "en")) %>%
rename(word_ori = word,
word = word_stem)
感情辞書の中身を確認。
bind_cols(get_sentiments("afinn") %>% head(10) %>% dplyr::rename(word_afinn = word,
score_afinn = score),
get_sentiments("bing") %>% head(10) %>% dplyr::rename(word_bing = word,
sent_bing = sentiment),
get_sentiments("nrc") %>% head(10) %>% dplyr::rename(word_nrc = word,
sent_nrc = sentiment)) %>%
DT::datatable(rownames = FALSE)
各辞書を今回のデータに紐付けた場合の上位。
bind_rows(c_token %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
mutate(sentiment = case_when(score > 0 ~ "positive",
score < 0 ~ "negative"),
method = "afinn") %>%
filter(!is.na(sentiment)),
c_token %>%
inner_join(get_sentiments("bing"), by = "word") %>%
mutate(method = "bing"),
c_token %>%
inner_join(get_sentiments("nrc"), by = "word") %>%
filter(sentiment %in% c("positive", "negative")) %>%
mutate(method = "nrc")
) %>%
count(word, sentiment, method) %>%
group_by(sentiment, method) %>%
top_n(5) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(sentiment ~ method, scales = "free") +
coord_flip()
各辞書を今回のデータに紐付けた場合のposinegaの単語数比較。
bind_rows(c_token %>%
left_join(get_sentiments("afinn"), by = "word") %>%
group_by(no) %>%
summarise(score = sum(score, na.rm = TRUE)) %>%
mutate(method = "afinn"),
c_token %>%
left_join(get_sentiments("bing"), by = "word") %>%
count(no, sentiment) %>%
mutate(n = case_when(sentiment == "positive" ~ n,
sentiment == "negative" ~ -n)) %>%
group_by(no) %>%
summarise(score = sum(n, na.rm = TRUE)) %>%
mutate(method = "bing"),
c_token %>%
left_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")),
by = "word") %>%
count(no, sentiment) %>%
mutate(n = case_when(sentiment == "positive" ~ n,
sentiment == "negative" ~ -n)) %>%
group_by(no) %>%
summarise(score = sum(n, na.rm = TRUE)) %>%
mutate(method = "nrc")
) %>%
mutate(score_colour = case_when(score < 0 ~ "a",
score == 0 ~ "b",
score > 0 ~ "c")) %>%
ggplot(aes(x = score, fill = method)) + # , fill = score_colour
# geom_histogram(binwidth = 1, show.legend = FALSE) +
stat_density(bw = 1) +
facet_wrap( ~ method, scales = "free") +
geom_vline(xintercept = 0) +
scale_y_continuous(labels = percent_format())
bind_rows(
get_sentiments("afinn") %>%
count() %>%
mutate(method = "afinn"),
get_sentiments("bing") %>%
count() %>%
mutate(method = "bing"),
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative"))%>%
count() %>%
mutate(method = "nrc")
) %>%
dplyr::select(method, n)
感情辞書afinnの中身を確認。
c_token %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
count(word, score) %>%
group_by(score) %>%
top_n(5, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = score)) +
geom_col(show.legend = FALSE) +
facet_wrap( ~ score, scales = "free", ncol = 3) +
coord_flip()
今回は感情の項目がposinegaのみと、単純なbingを使用する。
c_token <- c_token %>%
left_join(get_sentiments("bing"), by = "word")
bingを適用してみて、posinegaの数が多いものを確認。
c_token %>%
filter(!is.na(sentiment)) %>%
count(word, sentiment) %>%
group_by(sentiment) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
coord_flip() +
facet_wrap(~ sentiment, scales = "free")
不適切な単語を除外する。
d_typical_sent <- b_data %>%
dplyr::select(no, review) %>%
unnest_tokens(review, review, token = "regex", pattern = "\\.") %>%
filter(str_detect(review, c("fall", "worn", "flare", "top", "bust", "tank", "hang", "dark")))
c_token <- c_token %>%
mutate(sentiment = if_else(word %in% c("fall", "worn", "flare", "top", "bust", "tank", "hang", "dark"), as.character(NA), sentiment))
除外後に再度感情の上位を確認。
c_token %>%
filter(!is.na(sentiment)) %>%
count(word, sentiment) %>%
group_by(sentiment) %>%
top_n(10, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x = word, y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
coord_flip() +
facet_wrap(~ sentiment, scales = "free")
否定語の影響を考慮する。
e_negation <- b_data %>%
dplyr::select(no, review) %>%
unnest_tokens(bigrams, review, token = "ngrams", n = 2) %>%
separate(bigrams, c("word1", "word2"), sep = " ") %>%
mutate(word1 = wordStem(word1, language = "en"),
word2 = wordStem(word2, language = "en")) %>%
filter((word1 %in% c("no", "not", "never", "without")) |
str_detect(word1, "n't")) %>%
inner_join(get_sentiments("bing"), by = c("word2" = "word")) %>%
filter(!word2 %in% c("fall", "worn", "flare", "top", "bust", "tank", "hang", "dark"))
e_negation %>%
count(word1, sort = TRUE) %>%
head(10) %>%
DT::datatable(rownames = FALSE)
e_negation %>%
count(word2, sort = TRUE) %>%
head(10)
否定語の後に使われている単語ランキング。
e_negation %>%
count(word2, sentiment) %>%
top_n(20, n) %>%
mutate(word = reorder(word2, n)) %>%
ggplot(aes(x = word, y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
coord_flip() +
facet_wrap(~ sentiment, scales = "free")
否定語の直後に出現している単語の感情スコアの正負を逆転して再計算する。
f_negation_smr <- e_negation %>%
count(no, sentiment) %>%
spread(key = sentiment, value = n, fill = 0) %>%
dplyr::rename(positive_inv = positive,
negative_inv = negative)
g_sent_smr <- c_token %>%
count(no, feedback, rate, sentiment) %>%
spread(key = sentiment, value = n, fill = 0) %>%
dplyr::rename(neutral = "<NA>") %>%
left_join(f_negation_smr, by = "no") %>%
# mutate_all(funs(if_else(is.na(.), as.integer(0), as.integer(.)))) %>%
mutate_all(funs(replace(., is.na(.), 0))) %>%
mutate(posi_total = positive - positive_inv + negative_inv,
nega_total = negative - negative_inv + positive_inv,
sent_total = posi_total - nega_total)
否定語を考慮した後の感情スコア分布。
g_sent_smr %>%
mutate(sent_colour = case_when(
sent_total < 0 ~ "a",
sent_total == 0 ~ "b",
sent_total > 0 ~ "c"
)) %>%
ggplot(aes(x = sent_total, fill = sent_colour)) +
geom_histogram(show.legend = FALSE, binwidth = 1)
最も多くpositive wordが使われているreview。
b_data %>%
inner_join(g_sent_smr %>%
top_n(1, sent_total) %>%
mutate(no = as.factor(no)),
by = "no") %>%
dplyr::select(review, sent_total)
最もnegative。
b_data %>%
inner_join(g_sent_smr %>%
top_n(-1, sent_total) %>%
mutate(no = as.factor(no)),
by = "no") %>%
dplyr::select(review, sent_total) %>%
DT::datatable(class = "cell-border stripe", rownames = FALSE, options = list(pageLength = 1))
c_token %>%
inner_join(g_sent_smr %>%
top_n(1, desc(sent_total)),
by = "no") %>%
filter(!is.na(sentiment)) %>%
dplyr::select(no, word, sentiment) %>%
DT::datatable(class = "cell-border stripe", rownames = FALSE, options = list(pageLength = 7))
embroidery:かぎ裂きg_sent_smr %>%
ggplot(aes(sent_total, feedback)) +
geom_point() +
#geom_abline(colour = "red")
geom_smooth(method = "lm", colour = "red")
cor(g_sent_smr$feedback, g_sent_smr$sent_total)
## [1] -0.003327681
絶対値であればそう感が強く出るか、確認。
g_sent_smr %>%
ggplot(aes(abs(sent_total), feedback)) +
geom_point() +
#geom_abline(colour = "red")
geom_smooth(method = "lm", colour = "red")
cor(g_sent_smr$feedback, abs(g_sent_smr$sent_total))
## [1] 0.01046473
絶対値の方が相関は強く出た。しかし、今回はモデルには使用していない。
rateとsent_totalの関係確認。
g_sent_smr %>%
ggplot(aes(as.factor(rate), sent_total)) +
geom_boxplot() +
geom_abline(colour = "red") +
labs(x = "rate")
cor(g_sent_smr$rate, g_sent_smr$sent_total)
## [1] 0.4009777
h_data_for_analysis <- b_data %>%
left_join(g_sent_smr %>%
dplyr::select(no, posi_total, nega_total, sent_total),
by = "no") %>%
dplyr::select(-c(item_id, review)) %>%
mutate(title = if_else(title == "", 0, 1),
department = as.factor(department),
recommend = as.factor(recommend)) %>%
dplyr::rename(title_filled = title) %>%
inner_join(c_token %>%
count(no) %>%
dplyr::rename(word_cnt = n),
by = "no")
posinegaを比率に変換し、feedbackとの線形関係を確認。
h_data_for_analysis %>%
mutate(posi_ratio = posi_total / word_cnt,
nega_ratio = nega_total / word_cnt) %>%
gather(key = sent, value = ratio, posi_ratio, nega_ratio) %>%
ggplot(aes(ratio, feedback, fill = sent)) +
geom_point(show.legend = FALSE) +
stat_smooth(aes(group = sent, colour = sent), method = "lm", se = FALSE, fullrange = TRUE, show.legend = FALSE) +
facet_wrap( ~ sent, scales = "free")
i_dtm <- c_token %>%
filter(!str_detect(word, "\\d")) %>%
anti_join(stop_words, by = "word") %>%
mutate(no = as.factor(no)) %>%
count(no, word) %>%
cast_dtm(no, word, n)
j_for_topic <- i_dtm[apply(i_dtm, 1, sum) > 0, ]
適切なトピック数を推定。
# system.time(k_topic_number <- FindTopicsNumber(
# j_for_topic,
# topics = seq(from = 2, to = 10, by = 1),
# metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
# method = "Gibbs",
# control = list(seed = 2018),
# mc.cores = 2L,
# verbose = TRUE
# ))
| user | system | elapsed |
|---|---|---|
| 1.905 | 1.388 | 551.116 |
FindTopicsNumber_plot(k_topic_number)
Griffiths2004……周辺尤度
CaoJuan2009……cos類似度
Arun2010……距離?
Deveaud2014……KLダイバージェンス
解釈性も考慮し、トピック数は4で進める。
l_lda <- LDA(j_for_topic, k = 4, control = list(seed = 2018))
γ値によってトピックによる分類度合いを確認。
γ値とは文書の構成単語の何割がそのトピックから発生したかを示す(?)
m_gamma <- l_lda %>%
tidy(matrix = "gamma") %>%
mutate(document = factor(document, levels = unique(c_token$no)))
m_gamma %>%
ggplot(aes(gamma)) +
geom_histogram() +
scale_y_log10() +
labs(x = expression(gamma))
あまり分類されていない。
m_gamma %>%
ggplot(aes(gamma, fill = factor(topic))) +
geom_histogram() +
facet_wrap( ~ topic) +
scale_y_log10() +
labs(x = expression(gamma))
どの基準で別れたのか決定木で確認。
n_for_tree <- h_data_for_analysis %>%
left_join(m_gamma %>%
group_by(document) %>%
top_n(1, gamma) %>%
ungroup(),
by = c("no" = "document")) %>%
dplyr::select(-gamma, -no) %>%
mutate(topic = factor(topic))
o_tree_res <- rpart(topic ~ ., data = n_for_tree,
method = "class", cp = 0.005)
plot(as.party(o_tree_res))
departmentによる分岐が強い?
department別にγ値を確認。
m_gamma %>%
left_join(h_data_for_analysis, by = c("document" = "no")) %>%
mutate(department = reorder(department, gamma * topic)) %>%
ggplot(aes(factor(topic), gamma)) +
geom_boxplot() +
facet_wrap( ~ department, nrow = 2) +
labs(x = "topic number", y = expression(gamma))
あまり分かれていない。
p_beta <- l_lda %>%
tidy(matrix = "beta")
各トピックのβ値上位20単語。
β値とは単語分布のパラメータ(?)
トピックの解釈を試みる。
p_beta %>%
anti_join(stop_words, by = c("term" = "word")) %>%
group_by(topic) %>%
top_n(15, beta) %>%
ungroup() %>%
group_by(topic, term) %>%
arrange(desc(beta)) %>%
ungroup() %>%
mutate(term = reorder(str_c(term, topic, sep = "__"), beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap( ~ topic, scales = "free") +
coord_flip() +
scale_x_discrete(labels = function(x){
gsub("__.+$", "", x)
}) +
labs(x = NULL, y = expression(beta))
stop_wordsを除外した上で、各トピックβ値上位20単語のβ値を比較し、トピックの解釈を行う。
p_beta %>%
anti_join(stop_words, by = c("term" = "word")) %>%
semi_join(p_beta %>%
anti_join(stop_words, by = c("term" = "word")) %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup(), by = "term") %>%
group_by(topic) %>%
mutate(term = reorder(term, desc(beta))) %>%
ggplot(aes(x = factor(topic), y = beta)) +
geom_bar(stat = "identity", show.legend = FALSE, aes(fill = topic)) +
facet_wrap( ~ term, nrow = 2) +
labs(x = "topic", y = expression(beta))
解釈の難しい1,4のみ抽出して比較。
p_beta %>%
anti_join(stop_words, by = c("term" = "word")) %>%
semi_join(p_beta %>%
anti_join(stop_words, by = c("term" = "word")) %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup(), by = "term") %>%
group_by(topic) %>%
mutate(term = reorder(term, desc(beta))) %>%
filter(topic %in% c("1", "4")) %>%
ggplot(aes(x = factor(topic), y = beta)) +
geom_bar(stat = "identity", show.legend = FALSE, aes(fill = topic)) +
facet_wrap( ~ term, nrow = 2) +
labs(x = "topic", y = expression(beta))
topic1–trialの値が高い。
topic2–top,skirt,sweaterとnegativeな単語の値が高い。
topic3–onlineとsizeの値が高い。
topic4–dressとpositiveな単語の値が高い。
以下、解釈がずれていないか確認を行う。
q_topic_data <- b_data %>%
inner_join(m_gamma %>%
group_by(document) %>%
top_n(1, gamma) %>%
ungroup(), by = c("no" = "document")) %>%
mutate(topic = topic %>% factor()) %>%
filter(!is.na(topic))
トピックごとのデータ概観。
q_topic_data %>%
group_by(topic) %>%
summarise(age_avg = mean(age) %>% round(2),
rate_avg = mean(rate) %>% round(2),
feedback_avg = mean(feedback) %>% round(2)) %>%
DT::datatable(rownames = FALSE)
love, veri, perfectの単語のβ値が高かったtopic1,4はrateが高い?
q_topic_data %>%
group_by(topic) %>%
summarise(rate_avg = mean(rate, na.rm = TRUE),
rate_std = sd(rate, na.rm = TRUE))
若干だが平均が高く出ている。
dressのβ値が高いtopic4はdepartmentがdressの割合が高いか?
q_topic_data %>%
count(topic, department) %>%
group_by(topic) %>%
mutate(ratio = n / sum(n),
department = reorder(department, ratio)) %>%
ungroup() %>%
ggplot(aes(x = topic, y = ratio, fill = department)) +
geom_bar(stat = "identity", position = "fill") +
scale_y_continuous(labels = percent) +
geom_text_repel(aes(label = percent(ratio)), position = position_stack(vjust = 0.5), point.padding = 0, segment.size = 0)
高く出ている。
sizeの値が高いtopic3は、数字が多いか?
c_token %>%
inner_join(m_gamma %>%
group_by(document) %>%
top_n(1, gamma) %>%
ungroup(), by = c("no" = "document")) %>%
mutate(digit_bool = str_detect(word, "\\d")) %>%
group_by(topic) %>%
mutate(word_cnt = n()) %>%
ungroup() %>%
group_by(topic) %>%
summarise(digit_cnt = sum(digit_bool, na.rm = TRUE),
digit_ratio = sum(digit_bool, na.rm = TRUE) / first(word_cnt)) %>%
DT::datatable(rownames = FALSE)
若干高い。topic4も高く出ている。
tf_idfによるトピック解釈.
c_token %>%
filter(!str_detect(word, "\\d")) %>%
left_join(m_gamma %>%
group_by(document) %>%
top_n(1, gamma) %>%
ungroup(), by = c("no" = "document")) %>%
count(topic, word) %>%
filter(n >= 30) %>%
bind_tf_idf(word, topic, n) %>%
group_by(topic) %>%
top_n(10, tf_idf) %>%
ungroup() %>%
mutate(word = reorder(str_c(word, topic, sep = "__"), tf_idf)) %>%
ggplot(aes(word, tf_idf, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap( ~ topic, scales = "free") +
coord_flip() +
scale_x_discrete(labels = function(x){
gsub("__.+$", "", x)
})
tf_idfによるは解釈が困難。
トピック別に単語ネットワークを作成し解釈を試みる。
for(i in 1:4){
set.seed(2018)
xxx <- c_token %>%
anti_join(stop_words, by = "word") %>%
filter(!str_detect(word, "\\d")) %>%
left_join(m_gamma %>%
group_by(document) %>%
top_n(1, gamma) %>%
ungroup(), by = c("no" = "document")) %>%
filter(topic == i) %>%
count(no, word) %>%
filter(n >= 5) %>%
ungroup() %>%
pairwise_cor(word, no, sort = TRUE) %>%
filter(!near(correlation, 1)) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(alpha = correlation, width = correlation), colour = "lightgray") +
scale_edge_width(range = c(0.1, 1)) +
geom_node_point(size = 3, colour = "darkblue") +
geom_node_text(aes(label = name), repel = TRUE) +
labs(tite = str_c("topic_", i, sep = ""))
plot(xxx)
}
set.seed(2018)
my_cloud1 <- c_token %>%
anti_join(stop_words, by = "word") %>%
filter(!str_detect(word, "\\d"),
!str_detect(word,"_")) %>%
left_join(m_gamma %>%
group_by(document) %>%
top_n(1, gamma) %>%
ungroup(), by = c("no" = "document")
) %>%
filter(topic == 1) %>%
count(word) %>%
filter(n >= 50) %>%
wordcloud2(size = 10, gridSize = 5)
saveWidget(my_cloud1, file = "tmp1.html", selfcontained = FALSE)
img1 <- webshot::webshot("tmp1.html", file = "wc1.png", delay = 5, vwidth = 1000, vheight = 700)
tags$a(href = "tmp1.html",
tags$img(src = img1, alt = ""))
モデリング用のデータを作成。
posinegaのratioなど。
h_data_for_analysis <- h_data_for_analysis %>%
left_join(m_gamma %>%
mutate(topic = str_c("topic_", topic)) %>%
spread(key = topic, value = gamma, fill = 0),
by = c("no" = "document"))
t_data_for_modeling <- h_data_for_analysis %>%
mutate(posi_ratio = posi_total / word_cnt,
nega_ratio = nega_total / word_cnt) %>%
dplyr::select(-c("no", "posi_total", "nega_total", "sent_total", "topic_4")) %>%
mutate(department = department %>% as.factor()) %>%
makedummies() %>%
filter(complete.cases(.) == TRUE)
平均と標準偏差を確認。
u_mean <- mean(t_data_for_modeling$feedback, na.rm = TRUE)
v_var <- var(t_data_for_modeling$feedback, na.rm = TRUE)
cat("mean", u_mean, "\nvar", v_var)
## mean 2.630654
## var 33.48105
得られた数値を元に確率分布を描画。
t_data_for_modeling %>%
dplyr::select(feedback) %>%
mutate(data = "actual") %>%
dplyr::rename(cnt = feedback) %>%
bind_rows(data_frame(cnt = rnbinom(n = 100000, size = 0.466, mu = u_mean)) %>%
mutate(data = "nbinom"),
data_frame(cnt = rnorm(n = 100000, mean = u_mean, sd = sqrt(v_var))) %>%
mutate(data = "norm")) %>%
ggplot(aes(cnt, colour = data, linetype = data)) +
geom_line(stat = "density", adjust = 2) +
scale_linetype_manual(values = c("solid", "dashed", "longdash"))
分布の確認。
t_data_for_modeling %>%
dplyr::select(feedback) %>%
mutate(data = "actual") %>%
dplyr::rename(cnt = feedback) %>%
bind_rows(data_frame(cnt = rnorm(100000, mean = u_mean, sd = sqrt(v_var))) %>%
mutate(data = "norm")) %>%
ggplot(aes(cnt, colour = data, linetype = data)) +
geom_line(stat = "density", adjust = 2) +
scale_linetype_manual(values = c("solid", "dashed"))
線形関係の確認。
cor(t_data_for_modeling %>%
dplyr::select(-feedback, feedback)) %>%
cor.plot(xlas = 2)
cor(t_data_for_modeling %>%
dplyr::select(-feedback, feedback)) %>%
round(1) %>%
DT::datatable(options = list(pageLength = 17))
t_data_for_modeling <- t_data_for_modeling %>%
dplyr::select(-recommend)
w_lm <- lm(feedback ~ ., data = t_data_for_modeling)
summary(w_lm)
##
## Call:
## lm(formula = feedback ~ ., data = t_data_for_modeling)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.826 -2.497 -1.321 0.316 117.667
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.462875 8.553998 -0.288 0.773410
## age 0.019248 0.003067 6.275 3.55e-10 ***
## title_filled -0.398685 0.113297 -3.519 0.000434 ***
## rate -0.239593 0.037512 -6.387 1.72e-10 ***
## department_Bottoms 0.278348 1.574529 0.177 0.859682
## department_Dresses 1.087393 1.573744 0.691 0.489598
## department_Intimate 0.290926 1.577643 0.184 0.853697
## department_Jackets 0.829547 1.582142 0.524 0.600062
## department_Tops 0.675787 1.572722 0.430 0.667423
## department_Trend 1.032841 1.657329 0.623 0.533162
## word_cnt 0.035001 0.001502 23.306 < 2e-16 ***
## topic_1 11.137219 13.013385 0.856 0.392102
## topic_2 5.324439 12.505847 0.426 0.670290
## topic_3 -4.430701 13.340933 -0.332 0.739807
## posi_ratio -2.708613 0.858376 -3.156 0.001604 **
## nega_ratio -3.368905 2.446524 -1.377 0.168521
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.663 on 22624 degrees of freedom
## Multiple R-squared: 0.0427, Adjusted R-squared: 0.04207
## F-statistic: 67.28 on 15 and 22624 DF, p-value: < 2.2e-16
x_lm_step <- stepAIC(w_lm)
summary(x_lm_step)
##
## Call:
## lm(formula = feedback ~ age + title_filled + rate + department_Dresses +
## department_Jackets + department_Tops + word_cnt + posi_ratio,
## data = t_data_for_modeling)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.678 -2.500 -1.322 0.310 117.778
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.730399 0.244129 2.992 0.00278 **
## age 0.019155 0.003064 6.252 4.13e-10 ***
## title_filled -0.395565 0.113263 -3.492 0.00048 ***
## rate -0.225125 0.035601 -6.323 2.61e-10 ***
## department_Dresses 0.794062 0.105750 7.509 6.19e-14 ***
## department_Jackets 0.522331 0.194861 2.681 0.00736 **
## department_Tops 0.381168 0.095546 3.989 6.65e-05 ***
## word_cnt 0.035096 0.001495 23.471 < 2e-16 ***
## posi_ratio -2.556394 0.850877 -3.004 0.00266 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.663 on 22631 degrees of freedom
## Multiple R-squared: 0.04245, Adjusted R-squared: 0.04211
## F-statistic: 125.4 on 8 and 22631 DF, p-value: < 2.2e-16
car::vif(x_lm_step)
## age title_filled rate
## 1.006958 1.030972 1.113887
## department_Dresses department_Jackets department_Tops
## 1.561154 1.133824 1.590773
## word_cnt posi_ratio
## 1.298906 1.383510
多重共線性は見られない。
回帰診断図。
par(mfrow = c(2, 2))
plot(x_lm_step)
par(mfrow = c(1, 1))
等分散性、独立性、正規性に問題?
t_data_for_modeling %>%
dplyr::select(feedback) %>%
rename(cnt = feedback) %>%
mutate(data = "actual") %>%
bind_rows(data_frame(cnt = rnbinom(100000, size = 0.466, mu = u_mean)) %>%
mutate(data = "nb")) %>%
ggplot(aes(cnt, colour = data, linetype = data)) +
geom_line(stat = "density", adjust = 2) +
scale_linetype_manual(values = c("solid", "dashed"))
z_nb <- glm.nb(feedback ~ . ,data = t_data_for_modeling)
summary(z_nb)
##
## Call:
## glm.nb(formula = feedback ~ ., data = t_data_for_modeling, init.theta = 0.397347153,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5712 -1.1834 -0.5919 0.0616 5.6829
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1263093 2.5934486 -0.434 0.6641
## age 0.0077052 0.0009325 8.263 < 2e-16 ***
## title_filled -0.1660607 0.0346953 -4.786 1.70e-06 ***
## rate -0.0887924 0.0113392 -7.831 4.86e-15 ***
## department_Bottoms 0.9104660 0.6089563 1.495 0.1349
## department_Dresses 1.2275413 0.6087359 2.017 0.0437 *
## department_Intimate 0.8818906 0.6097637 1.446 0.1481
## department_Jackets 1.0988041 0.6107269 1.799 0.0720 .
## department_Tops 1.0566013 0.6085018 1.736 0.0825 .
## department_Trend 1.2244363 0.6280347 1.950 0.0512 .
## word_cnt 0.0138863 0.0004593 30.236 < 2e-16 ***
## topic_1 3.4105958 3.8972303 0.875 0.3815
## topic_2 -0.5400880 3.7509793 -0.144 0.8855
## topic_3 -1.2775170 4.0004741 -0.319 0.7495
## posi_ratio -2.0547557 0.2816983 -7.294 3.01e-13 ***
## nega_ratio -0.9477514 0.7546057 -1.256 0.2091
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.3973) family taken to be 1)
##
## Null deviance: 23454 on 22639 degrees of freedom
## Residual deviance: 21650 on 22624 degrees of freedom
## AIC: 88169
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.39735
## Std. Err.: 0.00526
##
## 2 x log-likelihood: -88135.38400
過分散検定。
odTest(z_nb)
## Likelihood ratio test of H0: Poisson, as restricted NB model:
## n.b., the distribution of the test-statistic under H0 is non-standard
## e.g., see help(odTest) for details/references
##
## Critical value of test statistic at the alpha= 0.05 level: 2.7055
## Chi-Square Test Statistic = 78260.8141 p-value = < 2.2e-16
有意と出たのでポアソン回帰より、負の二項回帰が適切。
za_nb_step <- stepAIC(z_nb)
summary(za_nb_step)
##
## Call:
## glm.nb(formula = feedback ~ age + title_filled + rate + department_Bottoms +
## department_Dresses + department_Intimate + department_Jackets +
## department_Tops + department_Trend + word_cnt + topic_1 +
## posi_ratio, data = t_data_for_modeling, init.theta = 0.3972941982,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5668 -1.1834 -0.5914 0.0621 5.6956
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7854655 0.9571858 -1.865 0.0621 .
## age 0.0076721 0.0009321 8.231 < 2e-16 ***
## title_filled -0.1644837 0.0346929 -4.741 2.13e-06 ***
## rate -0.0842510 0.0107621 -7.828 4.94e-15 ***
## department_Bottoms 0.9095515 0.6087028 1.494 0.1351
## department_Dresses 1.2280775 0.6084274 2.018 0.0435 *
## department_Intimate 0.8814041 0.6095127 1.446 0.1482
## department_Jackets 1.0971688 0.6104589 1.797 0.0723 .
## department_Tops 1.0558604 0.6082441 1.736 0.0826 .
## department_Trend 1.2240462 0.6277768 1.950 0.0512 .
## word_cnt 0.0139058 0.0004579 30.372 < 2e-16 ***
## topic_1 4.1020691 2.9337066 1.398 0.1620
## posi_ratio -2.0175157 0.2796614 -7.214 5.43e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.3973) family taken to be 1)
##
## Null deviance: 23451 on 22639 degrees of freedom
## Residual deviance: 21649 on 22627 degrees of freedom
## AIC: 88165
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.39729
## Std. Err.: 0.00526
##
## 2 x log-likelihood: -88137.02500
car::vif(za_nb_step)
## age title_filled rate
## 1.008240 1.031887 1.119594
## department_Bottoms department_Dresses department_Intimate
## 373.645470 566.983140 182.465315
## department_Jackets department_Tops department_Trend
## 121.365576 692.331052 16.169450
## word_cnt topic_1 posi_ratio
## 1.285490 1.017052 1.376472
mean absolute error
zb_mae_lm <- mean(abs(t_data_for_modeling$feedback - fitted.values(x_lm_step)))
zc_mae_nb <- mean(abs(t_data_for_modeling$feedback - fitted.values(za_nb_step)))
cat("MAE of lm is ", zb_mae_lm, "\nMAE of nb glm is ", zc_mae_nb)
## MAE of lm is 2.977924
## MAE of nb glm is 2.974029
za_nb_step %>%
tidy() %>%
rename(estimate_nb = estimate,
p.value_nb = p.value) %>%
dplyr::select(term, estimate_nb, p.value_nb) %>%
mutate_if(is.double, funs(round(., digits = 2))) %>%
left_join(x_lm_step %>%
tidy() %>%
rename(estimate_lm = estimate,
p.value_lm = p.value) %>%
dplyr::select(term, estimate_lm, p.value_lm) %>%
mutate_if(is.double, funs(round(., digits = 2))),
by = "term") %>%
dplyr::select(term, estimate_lm, p.value_lm, estimate_nb, p.value_nb) %>%
DT::datatable(options = list(pageLength = 13))
負の二項回帰の方がステップワイズ後に残った変数が多い。
title_filledについて確認。
t_data_for_modeling %>%
mutate(title_filled = title_filled %>% as.factor()) %>%
ggplot(aes(x = title_filled, y = feedback)) +
geom_boxplot() +
scale_y_continuous(limits = c(NA,10))
t_data_for_modeling %>%
group_by(title_filled) %>%
summarise(avg = mean(feedback),
med = median(feedback)) %>%
DT::datatable(rownames = FALSE, options = list(pageLength = 2))
t_data_for_modeling %>%
group_by(title_filled) %>%
do(qtile = quantile(.$feedback, seq(0, 1, by = 0.1))) %>%
cbind(do.call(rbind, .$qtile)) %>%
dplyr::select(-qtile) %>%
DT::datatable(rownames = FALSE, options = list(pageLength = 2))
t_data_for_modeling %>%
mutate(title_filled = title_filled %>% as.factor()) %>%
ggplot(aes(x = title_filled, y = feedback, fill = title_filled)) +
geom_violin() +
scale_y_continuous(limits = c(NA, 10))
data.frame(actual = t_data_for_modeling$feedback,
resi_lm = fitted.values(x_lm_step) - t_data_for_modeling$feedback,
resi_nb = fitted.values(za_nb_step) - t_data_for_modeling$feedback) %>%
gather(key = method, value = residual, resi_lm, resi_nb) %>%
ggplot(aes(x = actual, y = residual, colour = method)) +
geom_point(alpha = 0.3) +
geom_hline(yintercept = 0) +
facet_grid( ~ method)
grid.arrange(
data.frame(actual = t_data_for_modeling$feedback,
predict = fitted.values(x_lm_step)) %>%
round(0) %>%
gather(key = method, value = feedback, actual, predict) %>%
count(method, feedback) %>%
ggplot(aes(x = feedback, y = n, fill = method)) +
geom_col(width = 1, position = "dodge") +
xlim(c(-5, 10)) +
labs(title = "linear"),
data.frame(actual = t_data_for_modeling$feedback,
predict = fitted.values(za_nb_step)) %>%
round(0) %>%
gather(key = method, value = feedback, actual, predict) %>%
count(method, feedback) %>%
ggplot(aes(x = feedback, y = n, fill = method)) +
geom_col(width = 1, position = "dodge") +
xlim(c(-5, 10)) +
labs(title = "negative binomial"),
ncol = 2)
data.frame(actual = t_data_for_modeling$feedback,
resi_lm = fitted.values(x_lm_step) - t_data_for_modeling$feedback,
resi_nb = fitted.values(za_nb_step) - t_data_for_modeling$feedback) %>%
gather(key = method, value = residual, resi_lm, resi_nb) %>%
ggplot(aes(x = residual, colour = method)) +
geom_line(stat = "density") +
geom_vline(xintercept = 0, alpha = 0.3)
data.frame(actual = t_data_for_modeling$feedback,
resi_lm = fitted.values(x_lm_step) - t_data_for_modeling$feedback,
resi_nb = fitted.values(za_nb_step) - t_data_for_modeling$feedback) %>%
gather(key = method, value = residual, resi_lm, resi_nb) %>%
ggplot(aes(x = residual, colour = method)) +
geom_line(stat = "density") +
xlim(c(-5, 5)) +
geom_vline(xintercept = 0, alpha = 0.3)