Review Data

# Peek at the first six rows of the raw review table.
head(review_data, n = 6L)
## # A tibble: 6 x 5
##   Date    Product         Stars Title         Review                            
##   <chr>   <chr>           <dbl> <chr>         <chr>                             
## 1 2/28/15 iRobot Roomba ~     5 Five Stars    "You would not believe how well t~
## 2 1/12/15 iRobot Roomba ~     4 Four Stars    "You just walk away and it does t~
## 3 12/26/~ iRobot Roomba ~     5 Awesome love~ "You have to Roomba proof your ho~
## 4 8/4/13  iRobot Roomba ~     3 Love-hate th~ "Yes, it's a fascinating, albeit ~
## 5 12/22/~ iRobot Roomba ~     5 This vacuum ~ "Years ago I bought one of the or~
## 6 12/27/~ iRobot Roomba ~     5 Wow!          "Wow.Wow.  I never knew my floors~

Wrangling Text

Mean

# Average star rating, restricted to reviews of the Roomba 650.
summarise(
  filter(review_data, Product == "iRobot Roomba 650 for Pets"),
  stars_mean = mean(Stars)
)
## # A tibble: 1 x 1
##   stars_mean
##        <dbl>
## 1       4.49

Mean Min Max

# Per-product summary of the star ratings: mean, lowest and highest score.
review_data %>%
  group_by(Product) %>%
  summarise(
    stars_mean = mean(Stars),
    stars_min = min(Stars),
    stars_max = max(Stars)
  )
## # A tibble: 2 x 4
##   Product                                  stars_mean stars_min stars_max
##   <chr>                                         <dbl>     <dbl>     <dbl>
## 1 iRobot Roomba 650 for Pets                     4.49         1         5
## 2 iRobot Roomba 880 for Pets and Allergies       4.42         1         5

Row Counts per Product

# Number of reviews (rows) available for each product.
review_data %>%
  count(Product) %>%
  rename(number_rows = n)
## # A tibble: 2 x 2
##   Product                                  number_rows
##   <chr>                                          <int>
## 1 iRobot Roomba 650 for Pets                       633
## 2 iRobot Roomba 880 for Pets and Allergies        1200

Arrange Descending

# Review counts per product, most-reviewed product first.
count(review_data, Product, sort = TRUE)
## # A tibble: 2 x 2
##   Product                                      n
##   <chr>                                    <int>
## 1 iRobot Roomba 880 for Pets and Allergies  1200
## 2 iRobot Roomba 650 for Pets                 633

Tokens

# Split the Review text into tokens, one row per word; the remaining columns
# are repeated for every token of the same review.
tidy_review <- unnest_tokens(review_data, output = word, input = Review)

tidy_review
## # A tibble: 229,481 x 5
##    Date    Product                    Stars Title      word   
##    <chr>   <chr>                      <dbl> <chr>      <chr>  
##  1 2/28/15 iRobot Roomba 650 for Pets     5 Five Stars you    
##  2 2/28/15 iRobot Roomba 650 for Pets     5 Five Stars would  
##  3 2/28/15 iRobot Roomba 650 for Pets     5 Five Stars not    
##  4 2/28/15 iRobot Roomba 650 for Pets     5 Five Stars believe
##  5 2/28/15 iRobot Roomba 650 for Pets     5 Five Stars how    
##  6 2/28/15 iRobot Roomba 650 for Pets     5 Five Stars well   
##  7 2/28/15 iRobot Roomba 650 for Pets     5 Five Stars this   
##  8 2/28/15 iRobot Roomba 650 for Pets     5 Five Stars works  
##  9 1/12/15 iRobot Roomba 650 for Pets     4 Four Stars you    
## 10 1/12/15 iRobot Roomba 650 for Pets     4 Four Stars just   
## # ... with 229,471 more rows

Count Word

# Frequency of every token, most common first.
tidy_review %>%
  count(word, sort = TRUE)
## # A tibble: 10,310 x 2
##    word      n
##    <chr> <int>
##  1 the   11785
##  2 it     7905
##  3 and    6794
##  4 to     6440
##  5 i      6034
##  6 a      5884
##  7 is     3347
##  8 of     3229
##  9 have   2470
## 10 that   2410
## # ... with 10,300 more rows

Stop Words - Anti_Join

# Tokenize the reviews and drop every token that appears in stop_words.
# The join key is named explicitly so the result never depends on which
# column names the two tables happen to share, and no "Joining, by" message
# is emitted.
tidy_review_SW <- review_data %>%
  unnest_tokens(word, Review) %>%
  anti_join(stop_words, by = "word")

tidy_review_SW
## # A tibble: 78,868 x 5
##    Date     Product                    Stars Title                  word       
##    <chr>    <chr>                      <dbl> <chr>                  <chr>      
##  1 1/12/15  iRobot Roomba 650 for Pets     4 Four Stars             walk       
##  2 1/12/15  iRobot Roomba 650 for Pets     4 Four Stars             rest       
##  3 12/26/13 iRobot Roomba 650 for Pets     5 Awesome love it.       roomba     
##  4 12/26/13 iRobot Roomba 650 for Pets     5 Awesome love it.       proof      
##  5 12/26/13 iRobot Roomba 650 for Pets     5 Awesome love it.       house      
##  6 12/26/13 iRobot Roomba 650 for Pets     5 Awesome love it.       awesome    
##  7 12/26/13 iRobot Roomba 650 for Pets     5 Awesome love it.       pet        
##  8 12/26/13 iRobot Roomba 650 for Pets     5 Awesome love it.       cleans     
##  9 8/4/13   iRobot Roomba 650 for Pets     3 Love-hate this vaccuum fascinating
## 10 8/4/13   iRobot Roomba 650 for Pets     3 Love-hate this vaccuum albeit     
## # ... with 78,858 more rows

Count Words (Stop Words Removed)

# Token frequencies after stop-word removal, most common first.
arrange(count(tidy_review_SW, word), desc(n))
## # A tibble: 9,672 x 2
##    word         n
##    <chr>    <int>
##  1 roomba    2286
##  2 clean     1204
##  3 vacuum     989
##  4 hair       900
##  5 cleaning   809
##  6 time       795
##  7 house      745
##  8 floors     657
##  9 day        578
## 10 floor      561
## # ... with 9,662 more rows

## VISUALIZING TEXT

# Tokenize the reviews after tagging each one with a sequential id, so every
# token can be traced back to the review it came from, then drop the tokens
# listed in stop_words.
tidy_review_P <- review_data %>%
  mutate(id = row_number()) %>% # one id per review (its row position)
  unnest_tokens(word, Review) %>%
  anti_join(stop_words, by = "word") # explicit key: no implicit-join message

tidy_review_P
## # A tibble: 78,868 x 6
##    Date     Product                  Stars Title                   id word      
##    <chr>    <chr>                    <dbl> <chr>                <int> <chr>     
##  1 1/12/15  iRobot Roomba 650 for P~     4 Four Stars               2 walk      
##  2 1/12/15  iRobot Roomba 650 for P~     4 Four Stars               2 rest      
##  3 12/26/13 iRobot Roomba 650 for P~     5 Awesome love it.         3 roomba    
##  4 12/26/13 iRobot Roomba 650 for P~     5 Awesome love it.         3 proof     
##  5 12/26/13 iRobot Roomba 650 for P~     5 Awesome love it.         3 house     
##  6 12/26/13 iRobot Roomba 650 for P~     5 Awesome love it.         3 awesome   
##  7 12/26/13 iRobot Roomba 650 for P~     5 Awesome love it.         3 pet       
##  8 12/26/13 iRobot Roomba 650 for P~     5 Awesome love it.         3 cleans    
##  9 8/4/13   iRobot Roomba 650 for P~     3 Love-hate this vacc~     4 fascinati~
## 10 8/4/13   iRobot Roomba 650 for P~     3 Love-hate this vacc~     4 albeit    
## # ... with 78,858 more rows

Filter > 300

# Keep only the words that occur more than 300 times, most frequent first.
word_counts_P <- tidy_review_P %>%
  count(word, sort = TRUE) %>%
  filter(n > 300)

word_counts_P
## # A tibble: 25 x 2
##    word         n
##    <chr>    <int>
##  1 roomba    2286
##  2 clean     1204
##  3 vacuum     989
##  4 hair       900
##  5 cleaning   809
##  6 time       795
##  7 house      745
##  8 floors     657
##  9 day        578
## 10 floor      561
## # ... with 15 more rows

Review Word Counts + coord_flip()

# Bar chart of the frequent words. Words are mapped to x and the axes are
# then flipped, so counts end up on the horizontal axis and words on the
# vertical one.
word_counts_P %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  ggtitle("Review Word Counts")

Review Word Counts

# Bar chart of the frequent words with counts mapped directly to x and
# words to y (no axis flip needed).
ggplot(data = word_counts_P, mapping = aes(x = n, y = word)) +
  geom_col() +
  labs(title = "Review Word Counts")

## Custom Stop Words

# Extra, project-specific stop words as a two-column tibble (word, lexicon);
# every entry is tagged with the lexicon label "CUSTOM".
custom_stop_words <- tibble(
  word = c("roomba", "2"),
  lexicon = "CUSTOM"
)

custom_stop_words
## # A tibble: 2 x 2
##   word   lexicon
##   <chr>  <chr>  
## 1 roomba CUSTOM 
## 2 2      CUSTOM

Stop Word + Custom Stop Words

# Stack the custom entries underneath the stop_words table.
stop_words2 <- bind_rows(stop_words, custom_stop_words)

Tokens + Stopwords

# Tokenize the reviews, keeping an id per review, and remove every token
# listed in stop_words2 (the stop_words table plus the custom entries).
tidy_review_SW2 <- review_data %>%
  mutate(id = row_number()) %>% # sequential id, one per review
  select(id, Date, Product, Stars, Review) %>% # leaves out the Title column
  unnest_tokens(word, Review) %>%
  anti_join(stop_words2, by = "word") # explicit key: no implicit-join message

tidy_review_SW2
## # A tibble: 76,175 x 5
##       id Date     Product                    Stars word       
##    <int> <chr>    <chr>                      <dbl> <chr>      
##  1     2 1/12/15  iRobot Roomba 650 for Pets     4 walk       
##  2     2 1/12/15  iRobot Roomba 650 for Pets     4 rest       
##  3     3 12/26/13 iRobot Roomba 650 for Pets     5 proof      
##  4     3 12/26/13 iRobot Roomba 650 for Pets     5 house      
##  5     3 12/26/13 iRobot Roomba 650 for Pets     5 awesome    
##  6     3 12/26/13 iRobot Roomba 650 for Pets     5 pet        
##  7     3 12/26/13 iRobot Roomba 650 for Pets     5 cleans     
##  8     4 8/4/13   iRobot Roomba 650 for Pets     3 fascinating
##  9     4 8/4/13   iRobot Roomba 650 for Pets     3 albeit     
## 10     4 8/4/13   iRobot Roomba 650 for Pets     3 expensive  
## # ... with 76,165 more rows

Word Count + Filter + Arrange

# Words used more than 300 times once the custom stop words are removed,
# ordered from most to least frequent.
word_counts_SW2 <- tidy_review_SW2 %>%
  count(word, sort = TRUE) %>%
  filter(n > 300)

word_counts_SW2
## # A tibble: 23 x 2
##    word         n
##    <chr>    <int>
##  1 clean     1204
##  2 vacuum     989
##  3 hair       900
##  4 cleaning   809
##  5 time       795
##  6 house      745
##  7 floors     657
##  8 day        578
##  9 floor      561
## 10 dust       543
## # ... with 13 more rows

Plot

# Bar chart of the frequent words after custom stop-word removal.
# word is a character column here, so the bars are not ordered by count.
word_counts_SW2 %>%
  ggplot(aes(x = n, y = word)) +
  geom_col() +
  ggtitle("Review Word Counts")

Word Count + TOP10 + Factor Reorder

# Top-10 words by count. word2 is a factor copy of word whose levels are
# reordered by n, so a plot that maps word2 sorts its bars by frequency.
word_counts_SW3 <- tidy_review_SW2 %>%
  count(word) %>%
  top_n(n = 10, wt = n) %>%
  mutate(word2 = fct_reorder(word, n)) # factor, levels ordered by count

word_counts_SW3
## # A tibble: 10 x 3
##    word         n word2   
##    <chr>    <int> <fct>   
##  1 clean     1204 clean   
##  2 cleaning   809 cleaning
##  3 day        578 day     
##  4 dust       543 dust    
##  5 floor      561 floor   
##  6 floors     657 floors  
##  7 hair       900 hair    
##  8 house      745 house   
##  9 time       795 time    
## 10 vacuum     989 vacuum

Plot

# Bar chart of the top-10 words; mapping the reordered factor word2 to y
# makes the bars appear in order of their counts.
word_counts_SW3 %>%
  ggplot(aes(x = n, y = word2)) +
  geom_col() +
  ggtitle("Review Word Counts")

## EXAMPLE

# Top-20 words for each product, with word2 as a factor copy of word whose
# levels are reordered by n.
# NOTE(review): tidy_review is the unfiltered token table, so the result is
# dominated by stop words such as "a" and "and"; a stop-word-filtered table
# (e.g. tidy_review_SW2) may have been intended here -- confirm.
word_counts_EX <- tidy_review %>%
  count(word, Product) %>%
  group_by(Product) %>%
  top_n(n = 20, wt = n) %>%
  ungroup() %>% # drop the grouping before reordering the factor
  mutate(word2 = fct_reorder(word, n))

word_counts_EX
## # A tibble: 40 x 4
##    word  Product                                      n word2
##    <chr> <chr>                                    <int> <fct>
##  1 a     iRobot Roomba 650 for Pets                1901 a    
##  2 a     iRobot Roomba 880 for Pets and Allergies  3983 a    
##  3 and   iRobot Roomba 650 for Pets                2165 and  
##  4 and   iRobot Roomba 880 for Pets and Allergies  4629 and  
##  5 but   iRobot Roomba 650 for Pets                 529 but  
##  6 but   iRobot Roomba 880 for Pets and Allergies  1204 but  
##  7 for   iRobot Roomba 650 for Pets                 583 for  
##  8 for   iRobot Roomba 880 for Pets and Allergies  1357 for  
##  9 have  iRobot Roomba 650 for Pets                 811 have 
## 10 have  iRobot Roomba 880 for Pets and Allergies  1659 have 
## # ... with 30 more rows
# Bar chart of the per-product top words in a single panel; rows for both
# products share the same word2 position on the y axis.
word_counts_EX %>%
  ggplot(aes(x = n, y = word2)) +
  geom_col() +
  ggtitle("Review Word Counts")

# One panel per product, bars filled by product. The legend is switched off
# and the axes are flipped so the words run along the vertical axis.
word_counts_EX %>%
  ggplot(aes(x = word2, y = n, fill = Product)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ Product, scales = "free_y") +
  coord_flip() +
  labs(title = "Roomba Word Counts")