This report is a summary of a DataCamp lesson.
library(tidyverse)
library(tidytext)
library(wordcloud)
review_data <- read_csv("Roomba Reviews.csv")
theme_set(theme_bw())
unnest_tokens(), count(), stop_words
stop_words is a dataset of words that are unnecessary for analysis, such as articles; these are also called stopwords.
review_data %>%
unnest_tokens(output = word, Review) %>%
anti_join(stop_words) %>%
count(word) %>%
arrange(desc(n))
## # A tibble: 9,613 × 2
## word n
## <chr> <int>
## 1 roomba 2289
## 2 clean 1205
## 3 vacuum 989
## 4 hair 900
## 5 cleaning 809
## 6 time 795
## 7 house 745
## 8 floors 657
## 9 day 579
## 10 floor 561
## # ℹ 9,603 more rows
row_number() returns the row number.
tidy_review <-
review_data %>%
mutate(id = row_number()) %>%
unnest_tokens(word, Review) %>%
anti_join(stop_words)
word_count <-
tidy_review %>%
count(word) %>%
arrange(desc(n))
word_count %>%
filter(n > 300) %>%
ggplot(aes(x = word, y = n)) +
geom_col() +
coord_flip() +
scale_y_continuous(expand = c(0.01, 0)) +
ggtitle("Review Word Counts")
Custom stop words can be defined with tribble().
custom_stop_words <-
tribble(
~word, ~lexicon,
"roomba", "CUSTOM",
"2", "CUSTOM"
)
stop_words2 <- stop_words %>%
bind_rows(custom_stop_words)
fct_reorder() defines the display order of a factor.
tidy_review <-
review_data %>%
mutate(id = row_number()) %>%
select(id, Date, Product, Stars, Review) %>%
unnest_tokens(word, Review) %>%
anti_join(stop_words2)
word_count <- tidy_review %>%
count(word) %>%
filter(n > 300) %>%
mutate(word2 = fct_reorder(word, n))
ggplot(word_count, aes(x = word2, y = n)) +
geom_col() +
coord_flip() +
ggtitle("Review Word Counts")
slice_max()
word_count <- tidy_review %>%
count(word, Product) %>%
# Top 10 words per Product, 20 rows in total
group_by(Product) %>%
slice_max(n, n = 10) %>%
# It is good practice to ungroup after a grouped operation
ungroup() %>%
mutate(word2 = fct_reorder(word, n)) %>%
print()
## # A tibble: 20 × 4
## word Product n word2
## <chr> <chr> <int> <fct>
## 1 clean iRobot Roomba 650 for Pets 390 clean
## 2 vacuum iRobot Roomba 650 for Pets 311 vacuum
## 3 hair iRobot Roomba 650 for Pets 305 hair
## 4 time iRobot Roomba 650 for Pets 301 time
## 5 floors iRobot Roomba 650 for Pets 252 floors
## 6 house iRobot Roomba 650 for Pets 251 house
## 7 cleaning iRobot Roomba 650 for Pets 249 cleaning
## 8 day iRobot Roomba 650 for Pets 209 day
## 9 floor iRobot Roomba 650 for Pets 207 floor
## 10 run iRobot Roomba 650 for Pets 180 run
## 11 clean iRobot Roomba 880 for Pets and Allergies 815 clean
## 12 vacuum iRobot Roomba 880 for Pets and Allergies 678 vacuum
## 13 hair iRobot Roomba 880 for Pets and Allergies 595 hair
## 14 cleaning iRobot Roomba 880 for Pets and Allergies 560 cleaning
## 15 880 iRobot Roomba 880 for Pets and Allergies 518 880
## 16 house iRobot Roomba 880 for Pets and Allergies 494 house
## 17 time iRobot Roomba 880 for Pets and Allergies 494 time
## 18 floors iRobot Roomba 880 for Pets and Allergies 405 floors
## 19 love iRobot Roomba 880 for Pets and Allergies 403 love
## 20 dust iRobot Roomba 880 for Pets and Allergies 399 dust
ggplot(word_count, aes(x = word2, y = n, fill = Product)) +
geom_col(show.legend = TRUE) +
facet_wrap(~ Product, scales = "free_y") +
coord_flip() +
ggtitle("Review Word Counts")
wordcloud::wordcloud()
word_counts <- tidy_review %>%
count(word)
wordcloud(
words = word_counts$word,
freq = word_counts$n,
max.words = 30,
colors = "blue"
)
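As a small variant (my addition, not part of the lesson), the colors argument also accepts a vector of colors, e.g. a Brewer palette, which colors words from least to most frequent:
# A hedged variant: color words by frequency with a Brewer palette.
library(RColorBrewer)
wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  max.words = 30,
  colors = brewer.pal(8, "Dark2")  # low-frequency words get the first colors
)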
The most straightforward way to conduct sentiment analysis is to use an existing sentiment lexicon or dictionary.
# Print the bing lexicon
get_sentiments("bing")
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
library(textdata)
get_sentiments("afinn")
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
get_sentiments("afinn") %>%
summarize(
min = min(value),
max = max(value)
)
## # A tibble: 1 × 2
## min max
## <dbl> <dbl>
## 1 -5 5
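Because AFINN scores words on a numeric -5 to 5 scale, it can be joined to the tokenized reviews to compute an average sentiment score. A minimal sketch (my addition, not from the lesson), reusing tidy_review from above:
# Mean AFINN sentiment score per product
tidy_review %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(Product) %>%
  summarize(mean_sentiment = mean(value))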
get_sentiments("loughran")
## # A tibble: 4,150 × 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # ℹ 4,140 more rows
get_sentiments("loughran") %>%
count(sentiment) %>%
mutate(sentiment2 = fct_reorder(sentiment, n)) %>%
ggplot(aes(x = sentiment2, y = n)) +
geom_col() +
coord_flip() +
labs(
title = "Sentiment Counts in Loughran",
x = "Counts",
y = "Sentiment"
)
get_sentiments("nrc")
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
get_sentiments("nrc") %>%
count(sentiment) %>%
mutate(sentiment2 = fct_reorder(sentiment, n)) %>%
ggplot(aes(x = sentiment2, y = n)) +
geom_col() +
coord_flip() +
labs(title = "Sentiment Counts in NRC",
x = "Sentiment",
y = "Counts")
tidy_review %>%
inner_join(get_sentiments("loughran")) %>%
count(word, sentiment) %>%
filter(sentiment %in% c("positive", "negative")) %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word2 = fct_reorder(word, n)) %>%
# Visualization
ggplot(aes(x = word2, y = n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ sentiment, scales = "free") +
coord_flip() +
labs(
title = "Sentiment Word Counts",
x = "Words"
)
pivot_wider()
tidy_review %>%
inner_join(get_sentiments("bing")) %>%
count(Stars, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n) %>%
mutate(
overall_sentiment = positive - negative,
Stars = fct_reorder(factor(Stars), overall_sentiment)) %>%
# Visualization
ggplot(aes(x = Stars, y = overall_sentiment, fill = Stars)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(
title = "Overall Sentiment by Stars",
subtitle = "Reviews for Robotic Vacuums",
x = "Stars",
y = "Overall Sentiment"
)
Some more natural language processing (NLP) vocabulary:
cast_dtm(document column, term column, word count)
library(tm)
tidy_review %>%
count(word, id) %>%
cast_dtm(id, word, n)
## <<DocumentTermMatrix (documents: 1791, terms: 9611)>>
## Non-/sparse entries: 63100/17150201
## Sparsity : 100%
## Maximal term length: NA
## Weighting : term frequency (tf)
dtm_review <- tidy_review %>%
count(word, id) %>%
cast_dtm(id, word, n) %>%
as.matrix()
dtm_review[1:4, 2000:2004]
## Terms
## Docs consistency consistent consistently consistently.lack consisting
## 1069 0 0 0 0 0
## 425 0 0 0 0 0
## 113 0 0 0 0 0
## 367 0 0 0 0 0
topicmodels::LDA()
k: the number of topics we want the model to produce.
method: estimation method ("VEM" or "Gibbs").
control = list(seed = ...): sets the simulation seed so the results are reproducible.
library(topicmodels)
lda_out <- LDA(
dtm_review,
k = 2,
method = "Gibbs",
control = list(seed = 42)
)
glimpse()
glimpse(lda_out)
## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
## ..@ seedwords : NULL
## ..@ z : int [1:76193] 2 2 2 1 2 2 2 1 2 1 ...
## ..@ alpha : num 25
## ..@ call : language LDA(x = dtm_review, k = 2, method = "Gibbs", control = list(seed = 42))
## ..@ Dim : int [1:2] 1791 9611
## ..@ control :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
## ..@ k : int 2
## ..@ terms : chr [1:9611] "0.3" "0.5" "00" "00am" ...
## ..@ documents : chr [1:1791] "1069" "425" "113" "367" ...
## ..@ beta : num [1:2, 1:9611] -12.8 -10.5 -12.8 -10.5 -12.8 ...
## ..@ gamma : num [1:1791, 1:2] 0.465 0.516 0.68 0.539 0.42 ...
## ..@ wordassignments:List of 5
## .. ..$ i : int [1:63100] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..$ j : int [1:63100] 1 234 591 1466 1682 1689 1750 1753 2086 2175 ...
## .. ..$ v : num [1:63100] 2 2 2 1 2 2 2 2 2 2 ...
## .. ..$ nrow: int 1791
## .. ..$ ncol: int 9611
## .. ..- attr(*, "class")= chr "simple_triplet_matrix"
## ..@ loglikelihood : num -549523
## ..@ iter : int 2000
## ..@ logLiks : num(0)
## ..@ n : int 76193
tidy()
lda_topic <- lda_out %>%
tidy(matrix = "beta")
lda_topic %>%
arrange(desc(beta))
## # A tibble: 19,222 × 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 hair 0.0240
## 2 2 clean 0.0222
## 3 2 cleaning 0.0199
## 4 1 vacuum 0.0194
## 5 1 house 0.0188
## 6 1 floors 0.0175
## 7 1 day 0.0154
## 8 1 floor 0.0150
## 9 1 job 0.0143
## 10 1 love 0.0142
## # ℹ 19,212 more rows
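Besides the per-topic word probabilities (beta), tidy() can also extract the per-document topic probabilities (gamma). This is my addition, not covered in the lesson:
# gamma: probability of each topic within each document (review)
lda_out %>%
  tidy(matrix = "gamma") %>%
  arrange(document, desc(gamma))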
The key is to find topics that are distinct from one another, where the top terms do not repeat across topics.
lda_topic %>%
group_by(topic) %>%
slice_max(beta, n = 15) %>%
ungroup() %>%
mutate(term2 = fct_reorder(term, beta)) %>%
# Visualization
ggplot(aes(term2, beta, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
lda_out <- LDA(
dtm_review,
k = 3,
method = "Gibbs",
control = list(seed = 42)
)
lda_topic <- lda_out %>%
tidy(matrix = "beta")
lda_topic %>%
group_by(topic) %>%
slice_max(beta, n = 15) %>%
ungroup() %>%
mutate(term2 = fct_reorder(term, beta)) %>%
# Visualization
ggplot(aes(term2, beta, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
lda_out <- LDA(
dtm_review,
k = 4,
method = "Gibbs",
control = list(seed = 42)
)
lda_topic <- lda_out %>%
tidy(matrix = "beta")
lda_topic %>%
group_by(topic) %>%
slice_max(beta, n = 15) %>%
ungroup() %>%
mutate(term2 = fct_reorder(term, beta)) %>%
# Visualization
ggplot(aes(term2, beta, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
The four topics can be interpreted as 1: Rugs, 2: Floors, 3: Functionality, 4: Performance (as for why, no idea as usual, haha).
A reasonable k is the point at which, as you increase the number of topics, the topics start to repeat.
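To make that comparison less manual, one could fit models for several values of k and collect each model's top terms side by side. A minimal sketch (my addition, not from the lesson), reusing dtm_review from above:
# Fit LDA for several k values and gather each model's top 10 terms per topic,
# to eyeball where topics start to repeat. (Gibbs sampling is slow; this refits
# the model once per k.)
top_terms_for_k <- function(k) {
  LDA(dtm_review, k = k, method = "Gibbs", control = list(seed = 42)) %>%
    tidy(matrix = "beta") %>%
    group_by(topic) %>%
    slice_max(beta, n = 10) %>%
    ungroup() %>%
    mutate(k = k)
}
map_dfr(2:5, top_terms_for_k)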