This report is a summary of a DataCamp lesson.

library(tidyverse)
library(tidytext)
library(wordcloud)

review_data <- read_csv("Roomba Reviews.csv")

theme_set(theme_bw())

1. Wrangling Text

review_data %>% 
  unnest_tokens(output = word, Review) %>% 
  anti_join(stop_words) %>% 
  count(word) %>% 
  arrange(desc(n))
## # A tibble: 9,613 × 2
##    word         n
##    <chr>    <int>
##  1 roomba    2289
##  2 clean     1205
##  3 vacuum     989
##  4 hair       900
##  5 cleaning   809
##  6 time       795
##  7 house      745
##  8 floors     657
##  9 day        579
## 10 floor      561
## # ℹ 9,603 more rows

2. Visualizing Text

2.1 Plotting word counts

  • row_number(): returns the row number (used here to give each review an id)
tidy_review <- 
  review_data %>% 
    mutate(id = row_number()) %>% 
    unnest_tokens(word, Review) %>% 
    anti_join(stop_words)

word_count <- 
  tidy_review %>% 
    count(word) %>% 
    arrange(desc(n))

word_count %>% 
  filter(n > 300) %>% 
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(expand = c(0.01, 0)) +
  ggtitle("Review Word Counts")

2.2 Improving word count plots

  • Custom stop words can be defined with tribble()
custom_stop_words <- 
  tribble(
    ~word, ~lexicon,
    "roomba", "CUSTOM",
    "2", "CUSTOM"
  )

stop_words2 <- stop_words %>% 
  bind_rows(custom_stop_words)

  • fct_reorder(): sets the display order of a factor by another variable
tidy_review <- 
  review_data %>% 
    mutate(id = row_number()) %>% 
    select(id, Date, Product, Stars, Review) %>% 
    unnest_tokens(word, Review) %>% 
    anti_join(stop_words2)

word_count <- tidy_review %>% 
  count(word) %>% 
  filter(n > 300) %>% 
  mutate(word2 = fct_reorder(word, n))

ggplot(word_count, aes(x = word2, y = n)) +
  geom_col() +
  coord_flip() +
  ggtitle("Review Word Counts")

2.3 Faceting word count plots

  • slice_max(): keeps the rows with the largest values of a variable within each group
word_count <- tidy_review %>% 
  count(word, Product) %>% 
  # Top 10 words per Product, so 20 rows in total
  group_by(Product) %>% 
  slice_max(n, n = 10) %>% 
  # It is good practice to ungroup after a grouped operation
  ungroup() %>% 
  mutate(word2 = fct_reorder(word, n)) %>% 
  print()
## # A tibble: 20 × 4
##    word     Product                                      n word2   
##    <chr>    <chr>                                    <int> <fct>   
##  1 clean    iRobot Roomba 650 for Pets                 390 clean   
##  2 vacuum   iRobot Roomba 650 for Pets                 311 vacuum  
##  3 hair     iRobot Roomba 650 for Pets                 305 hair    
##  4 time     iRobot Roomba 650 for Pets                 301 time    
##  5 floors   iRobot Roomba 650 for Pets                 252 floors  
##  6 house    iRobot Roomba 650 for Pets                 251 house   
##  7 cleaning iRobot Roomba 650 for Pets                 249 cleaning
##  8 day      iRobot Roomba 650 for Pets                 209 day     
##  9 floor    iRobot Roomba 650 for Pets                 207 floor   
## 10 run      iRobot Roomba 650 for Pets                 180 run     
## 11 clean    iRobot Roomba 880 for Pets and Allergies   815 clean   
## 12 vacuum   iRobot Roomba 880 for Pets and Allergies   678 vacuum  
## 13 hair     iRobot Roomba 880 for Pets and Allergies   595 hair    
## 14 cleaning iRobot Roomba 880 for Pets and Allergies   560 cleaning
## 15 880      iRobot Roomba 880 for Pets and Allergies   518 880     
## 16 house    iRobot Roomba 880 for Pets and Allergies   494 house   
## 17 time     iRobot Roomba 880 for Pets and Allergies   494 time    
## 18 floors   iRobot Roomba 880 for Pets and Allergies   405 floors  
## 19 love     iRobot Roomba 880 for Pets and Allergies   403 love    
## 20 dust     iRobot Roomba 880 for Pets and Allergies   399 dust
ggplot(word_count, aes(x = word2, y = n, fill = Product)) +
  geom_col(show.legend = TRUE) +
  facet_wrap(~ Product, scales = "free_y") +
  coord_flip() +
  ggtitle("Review Word Counts")

2.4 Plotting word clouds

word_counts <- tidy_review %>% 
  count(word)

wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  max.words = 30,
  colors = "blue"
)
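
The colors argument of wordcloud() also accepts a vector of colors, so words can be shaded by frequency. A small, optional variation of the call above (assuming the RColorBrewer package, which wordcloud depends on, is available):

library(RColorBrewer)

wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  max.words = 30,
  random.order = FALSE,            # place the most frequent words first
  colors = brewer.pal(8, "Dark2")  # shade words from least to most frequent
)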

3. Sentiment Analysis

3.1 Sentiment dictionaries

The most straightforward way to conduct sentiment analysis is to use an existing sentiment lexicon or dictionary.

3.1.1 bing

  • positive / negative
# Print the bing lexicon
get_sentiments("bing")
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows
get_sentiments("bing") %>% 
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

3.1.2 afinn

  • The degree of sentiment is returned as a numeric score (value)
library(textdata)

get_sentiments("afinn")
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows
get_sentiments("afinn") %>% 
  summarize(
    min = min(value),
    max = max(value)
  )
## # A tibble: 1 × 2
##     min   max
##   <dbl> <dbl>
## 1    -5     5
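
Because AFINN assigns numeric values, sentiment can be aggregated arithmetically. As a small sketch (not part of the original lesson), the AFINN values can be summed per review using the tidy_review tibble from above:

tidy_review %>% 
  inner_join(get_sentiments("afinn")) %>%    # keep only words found in AFINN
  group_by(id) %>% 
  summarize(afinn_score = sum(value)) %>%    # total sentiment value per review
  arrange(afinn_score)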

3.1.3 loughran

get_sentiments("loughran")
## # A tibble: 4,150 × 2
##    word         sentiment
##    <chr>        <chr>    
##  1 abandon      negative 
##  2 abandoned    negative 
##  3 abandoning   negative 
##  4 abandonment  negative 
##  5 abandonments negative 
##  6 abandons     negative 
##  7 abdicated    negative 
##  8 abdicates    negative 
##  9 abdicating   negative 
## 10 abdication   negative 
## # ℹ 4,140 more rows
get_sentiments("loughran") %>% 
  count(sentiment) %>% 
  mutate(sentiment2 = fct_reorder(sentiment, n)) %>% 
  
  ggplot(aes(x = sentiment2, y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Sentiment Counts in Loughran",
    x = "Counts", 
    y = "Sentiment"
  )

3.1.4 nrc

get_sentiments("nrc")
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows
get_sentiments("nrc") %>% 
  count(sentiment) %>% 
  mutate(sentiment2 = fct_reorder(sentiment, n)) %>% 
  ggplot(aes(x = sentiment2, y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Sentiment Counts in NRC", 
       x = "Sentiment", 
       y = "Counts")

3.2 Appending dictionaries

tidy_review %>% 
  inner_join(get_sentiments("loughran")) %>% 
  count(word, sentiment) %>% 
  filter(sentiment %in% c("positive", "negative")) %>% 
  group_by(sentiment) %>% 
  slice_max(n, n = 10) %>% 
  ungroup() %>% 
  mutate(word2 = fct_reorder(word, n)) %>% 
  # Plot
  ggplot(aes(x = word2, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free") +
  coord_flip() +
  labs(
    title = "Sentiment Word Counts",
    x = "Words"
  )

3.3 Improving sentiment analysis

  • pivot_wider(): spreads the sentiment counts into separate positive and negative columns
tidy_review %>% 
  inner_join(get_sentiments("bing")) %>% 
  count(Stars, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n) %>% 
  mutate(
    overall_sentiment = positive - negative,
    Stars = fct_reorder(factor(Stars), overall_sentiment)) %>% 
  # Plot
  ggplot(aes(x = Stars, y = overall_sentiment, fill = Stars)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Overall Sentiment by Stars",
    subtitle = "Reviews for Robotic Vacuums",
    x = "Stars",
    y = "Overall Sentiment"
  )

4. Topic Modeling

4.1 Latent Dirichlet allocation

4.1.1 Unsupervised learning

Some more natural language processing (NLP) vocabulary:

  • Latent Dirichlet allocation (LDA) is a standard topic model
    • LDA is a probability-based model that automatically discovers topics in a collection of documents
  • A collection of documents is known as a corpus
  • Bag-of-words means treating every word in a document separately
    • Every word is treated individually, ignoring order and grammar; only occurrence counts matter (see the sketch after this list)
  • Topic models find patterns of words appearing together
  • Searching for patterns rather than predicting is known as unsupervised learning
    • The key is searching for patterns rather than making predictions
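
As a minimal illustration of the bag-of-words idea, the sketch below tokenizes a tiny invented corpus and keeps only per-document word counts; word order is discarded entirely (the toy text is made up for illustration):

toy_docs <- tibble(
  id   = 1:2,
  text = c("the roomba cleans the floor",
           "the floor looks clean after the roomba runs")
)

toy_docs %>% 
  unnest_tokens(word, text) %>%   # one row per word; order is lost here
  count(id, word)                 # bag-of-words: document-word counts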

4.1.2 Clustering vs. topic modeling

  • Clustering
    • Clusters are uncovered based on distance, which is continuous.
    • Every object is assigned to a single cluster.
  • Topic Modeling
    • Topics are uncovered based on word frequency, which is discrete.
    • Every document is a mixture (i.e., partial member) of every topic.

4.2 Document term matrices (DTM)

  • Before modeling, the text must be cast into document-term-matrix (DTM) format
  • cast_dtm(document column, term column, count column)
library(tm)

tidy_review %>% 
  count(word, id) %>% 
  cast_dtm(id, word, n)
## <<DocumentTermMatrix (documents: 1791, terms: 9611)>>
## Non-/sparse entries: 63100/17150201
## Sparsity           : 100%
## Maximal term length: NA
## Weighting          : term frequency (tf)
  • We can see there are 1,791 documents and 9,611 terms.
dtm_review <- tidy_review %>% 
  count(word, id) %>% 
  cast_dtm(id, word, n) %>% 
  as.matrix()

dtm_review[1:4, 2000:2004]
##       Terms
## Docs   consistency consistent consistently consistently.lack consisting
##   1069           0          0            0                 0          0
##   425            0          0            0                 0          0
##   113            0          0            0                 0          0
##   367            0          0            0                 0          0

4.3 Running topic models

  • topicmodels::LDA()
    • k: the number of topics we want the model to produce
    • method: estimation method ("VEM" or "Gibbs")
    • control = list(seed = ...): sets the simulation seed so results are reproducible
library(topicmodels)

lda_out <- LDA(
  dtm_review,
  k = 2,
  method = "Gibbs",
  control = list(seed = 42)
)
  • glimpse(): inspect the slots of the fitted model object
glimpse(lda_out)
## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
##   ..@ seedwords      : NULL
##   ..@ z              : int [1:76193] 2 2 2 1 2 2 2 1 2 1 ...
##   ..@ alpha          : num 25
##   ..@ call           : language LDA(x = dtm_review, k = 2, method = "Gibbs", control = list(seed = 42))
##   ..@ Dim            : int [1:2] 1791 9611
##   ..@ control        :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
##   ..@ k              : int 2
##   ..@ terms          : chr [1:9611] "0.3" "0.5" "00" "00am" ...
##   ..@ documents      : chr [1:1791] "1069" "425" "113" "367" ...
##   ..@ beta           : num [1:2, 1:9611] -12.8 -10.5 -12.8 -10.5 -12.8 ...
##   ..@ gamma          : num [1:1791, 1:2] 0.465 0.516 0.68 0.539 0.42 ...
##   ..@ wordassignments:List of 5
##   .. ..$ i   : int [1:63100] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..$ j   : int [1:63100] 1 234 591 1466 1682 1689 1750 1753 2086 2175 ...
##   .. ..$ v   : num [1:63100] 2 2 2 1 2 2 2 2 2 2 ...
##   .. ..$ nrow: int 1791
##   .. ..$ ncol: int 9611
##   .. ..- attr(*, "class")= chr "simple_triplet_matrix"
##   ..@ loglikelihood  : num -549523
##   ..@ iter           : int 2000
##   ..@ logLiks        : num(0) 
##   ..@ n              : int 76193
  • tidy(matrix = "beta"): extract the per-topic word probabilities as a tidy tibble
lda_topic <- lda_out %>% 
  tidy(matrix = "beta")

lda_topic %>% 
  arrange(desc(beta))
## # A tibble: 19,222 × 3
##    topic term       beta
##    <int> <chr>     <dbl>
##  1     1 hair     0.0240
##  2     2 clean    0.0222
##  3     2 cleaning 0.0199
##  4     1 vacuum   0.0194
##  5     1 house    0.0188
##  6     1 floors   0.0175
##  7     1 day      0.0154
##  8     1 floor    0.0150
##  9     1 job      0.0143
## 10     1 love     0.0142
## # ℹ 19,212 more rows
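
beta gives per-topic word probabilities; the complementary gamma matrix gives per-document topic proportions, which is what "every document is a mixture of every topic" means in practice. A short sketch using the same lda_out object (not shown in the original lesson):

lda_out %>% 
  tidy(matrix = "gamma") %>%   # columns: document, topic, gamma
  arrange(desc(gamma))         # documents most strongly tied to a single topic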

4.4 Interpreting topics

The key is to find topics that are distinct from one another, i.e., where the same high-probability words do not keep repeating across topics.

lda_topic %>% 
  group_by(topic) %>% 
  slice_max(beta, n = 15) %>% 
  ungroup() %>% 
  mutate(term2 = fct_reorder(term, beta)) %>% 
  # Plot
  ggplot(aes(term2, beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

  • The highest-probability words for each topic are listed
  • However, this kind of labeling is subjective
  • A plausible reading is 1: Performance, 2: Functionality

Three topics (k = 3)?

lda_out <- LDA(
  dtm_review,
  k = 3,
  method = "Gibbs",
  control = list(seed = 42)
)

lda_topic <- lda_out %>% 
  tidy(matrix = "beta")

lda_topic %>% 
  group_by(topic) %>% 
  slice_max(beta, n = 15) %>% 
  ungroup() %>% 
  mutate(term2 = fct_reorder(term, beta)) %>% 
  # Plot
  ggplot(aes(term2, beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

  • A plausible reading is 1: Performance, 2: Functionality, 3: Frustrations (though, admittedly, the reasoning is fuzzy)

Four topics (k = 4)?

lda_out <- LDA(
  dtm_review,
  k = 4,
  method = "Gibbs",
  control = list(seed = 42)
)

lda_topic <- lda_out %>% 
  tidy(matrix = "beta")

lda_topic %>% 
  group_by(topic) %>% 
  slice_max(beta, n = 15) %>% 
  ungroup() %>% 
  mutate(term2 = fct_reorder(term, beta)) %>% 
  # Plot
  ggplot(aes(term2, beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

  • A plausible reading is 1: Rugs, 2: Floors, 3: Functionality, 4: Performance (again, the reasoning is fuzzy)

  • As the number of topics is increased, the point at which topics start to repeat one another suggests an appropriate k; see the sketch below.
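
One rough way to operationalize that heuristic (my own sketch, not from the lesson; the candidate values of k and the top-5 cutoff are arbitrary) is to refit the model for several k and check whether the same terms start to top more than one topic:

top_terms_by_k <- map_dfr(2:5, function(k) {
  lda_k <- LDA(dtm_review, k = k, method = "Gibbs", control = list(seed = 42))
  tidy(lda_k, matrix = "beta") %>% 
    group_by(topic) %>% 
    slice_max(beta, n = 5) %>%   # top 5 terms per topic
    ungroup() %>% 
    mutate(n_topics = k)
})

top_terms_by_k %>% 
  count(n_topics, term) %>% 
  filter(n > 1)   # terms topping several topics at a given k hint at overlap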