# Preview the first six reviews to check the columns and their types.
head(review_data, n = 6L)
## # A tibble: 6 x 5
## Date Product Stars Title Review
## <chr> <chr> <dbl> <chr> <chr>
## 1 2/28/15 iRobot Roomba ~ 5 Five Stars "You would not believe how well t~
## 2 1/12/15 iRobot Roomba ~ 4 Four Stars "You just walk away and it does t~
## 3 12/26/~ iRobot Roomba ~ 5 Awesome love~ "You have to Roomba proof your ho~
## 4 8/4/13 iRobot Roomba ~ 3 Love-hate th~ "Yes, it's a fascinating, albeit ~
## 5 12/22/~ iRobot Roomba ~ 5 This vacuum ~ "Years ago I bought one of the or~
## 6 12/27/~ iRobot Roomba ~ 5 Wow! "Wow.Wow. I never knew my floors~
Mean
# Average star rating, restricted to the Roomba 650 reviews.
review_data %>%
  filter(Product == "iRobot Roomba 650 for Pets") %>%
  summarise(
    stars_mean = mean(Stars)
  )
## # A tibble: 1 x 1
## stars_mean
## <dbl>
## 1 4.49
Mean Min Max
# Per-product rating summary: mean, lowest and highest star score.
review_data %>%
  group_by(Product) %>%
  summarise(
    stars_mean = mean(Stars),
    stars_min = min(Stars),
    stars_max = max(Stars)
  )
## # A tibble: 2 x 4
## Product stars_mean stars_min stars_max
## <chr> <dbl> <dbl> <dbl>
## 1 iRobot Roomba 650 for Pets 4.49 1 5
## 2 iRobot Roomba 880 for Pets and Allergies 4.42 1 5
Count Rows per Product
# Number of reviews (rows) available for each product.
summarise(
  group_by(review_data, Product),
  number_rows = n()
)
## # A tibble: 2 x 2
## Product number_rows
## <chr> <int>
## 1 iRobot Roomba 650 for Pets 633
## 2 iRobot Roomba 880 for Pets and Allergies 1200
Arrange in Descending Order
# Tally reviews per product with count(); sort = TRUE orders the result by
# descending n, so the most-reviewed product comes first.
review_data %>%
  count(Product, sort = TRUE)
## # A tibble: 2 x 2
## Product n
## <chr> <int>
## 1 iRobot Roomba 880 for Pets and Allergies 1200
## 2 iRobot Roomba 650 for Pets 633
Tokens
# Tokenize the Review text: one row per word, stored in a new `word` column
# that takes the place of Review. All other columns are carried along.
tidy_review <- unnest_tokens(review_data, word, Review)
tidy_review
## # A tibble: 229,481 x 5
## Date Product Stars Title word
## <chr> <chr> <dbl> <chr> <chr>
## 1 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars you
## 2 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars would
## 3 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars not
## 4 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars believe
## 5 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars how
## 6 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars well
## 7 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars this
## 8 2/28/15 iRobot Roomba 650 for Pets 5 Five Stars works
## 9 1/12/15 iRobot Roomba 650 for Pets 4 Four Stars you
## 10 1/12/15 iRobot Roomba 650 for Pets 4 Four Stars just
## # ... with 229,471 more rows
Count Words
# Frequency of every token, most common first.
tidy_review %>%
  count(word, sort = TRUE)
## # A tibble: 10,310 x 2
## word n
## <chr> <int>
## 1 the 11785
## 2 it 7905
## 3 and 6794
## 4 to 6440
## 5 i 6034
## 6 a 5884
## 7 is 3347
## 8 of 3229
## 9 have 2470
## 10 that 2410
## # ... with 10,300 more rows
Stop Words - Anti_Join
# Tokenize the reviews again and drop every token that also appears in
# stop_words. No `by` is given, so the join key is inferred from the shared
# column name (reported in the message below).
tidy_review_SW <- anti_join(
  unnest_tokens(review_data, word, Review),
  stop_words
)
## Joining, by = "word"
tidy_review_SW
## # A tibble: 78,868 x 5
## Date Product Stars Title word
## <chr> <chr> <dbl> <chr> <chr>
## 1 1/12/15 iRobot Roomba 650 for Pets 4 Four Stars walk
## 2 1/12/15 iRobot Roomba 650 for Pets 4 Four Stars rest
## 3 12/26/13 iRobot Roomba 650 for Pets 5 Awesome love it. roomba
## 4 12/26/13 iRobot Roomba 650 for Pets 5 Awesome love it. proof
## 5 12/26/13 iRobot Roomba 650 for Pets 5 Awesome love it. house
## 6 12/26/13 iRobot Roomba 650 for Pets 5 Awesome love it. awesome
## 7 12/26/13 iRobot Roomba 650 for Pets 5 Awesome love it. pet
## 8 12/26/13 iRobot Roomba 650 for Pets 5 Awesome love it. cleans
## 9 8/4/13 iRobot Roomba 650 for Pets 3 Love-hate this vaccuum fascinating
## 10 8/4/13 iRobot Roomba 650 for Pets 3 Love-hate this vaccuum albeit
## # ... with 78,858 more rows
Count Words After Removing Stop Words
# Word frequencies once stop words are gone, most common first.
tidy_review_SW %>%
  count(word, sort = TRUE)
## # A tibble: 9,672 x 2
## word n
## <chr> <int>
## 1 roomba 2286
## 2 clean 1204
## 3 vacuum 989
## 4 hair 900
## 5 cleaning 809
## 6 time 795
## 7 house 745
## 8 floors 657
## 9 day 578
## 10 floor 561
## # ... with 9,662 more rows
##VISUALIZING TEXT
# Same cleaning as before, but first tag every review with its row number
# (`id`) so each token can be traced back to the review it came from.
tidy_review_P <- review_data %>%
  mutate(id = row_number()) %>%
  unnest_tokens(word, Review) %>%
  anti_join(stop_words)
## Joining, by = "word"
tidy_review_P
## # A tibble: 78,868 x 6
## Date Product Stars Title id word
## <chr> <chr> <dbl> <chr> <int> <chr>
## 1 1/12/15 iRobot Roomba 650 for P~ 4 Four Stars 2 walk
## 2 1/12/15 iRobot Roomba 650 for P~ 4 Four Stars 2 rest
## 3 12/26/13 iRobot Roomba 650 for P~ 5 Awesome love it. 3 roomba
## 4 12/26/13 iRobot Roomba 650 for P~ 5 Awesome love it. 3 proof
## 5 12/26/13 iRobot Roomba 650 for P~ 5 Awesome love it. 3 house
## 6 12/26/13 iRobot Roomba 650 for P~ 5 Awesome love it. 3 awesome
## 7 12/26/13 iRobot Roomba 650 for P~ 5 Awesome love it. 3 pet
## 8 12/26/13 iRobot Roomba 650 for P~ 5 Awesome love it. 3 cleans
## 9 8/4/13 iRobot Roomba 650 for P~ 3 Love-hate this vacc~ 4 fascinati~
## 10 8/4/13 iRobot Roomba 650 for P~ 3 Love-hate this vacc~ 4 albeit
## # ... with 78,858 more rows
Filter > 300
# Word frequencies, most common first, keeping only words that occur more
# than 300 times.
word_counts_P <- tidy_review_P %>%
  count(word, sort = TRUE) %>%
  filter(n > 300)
word_counts_P
## # A tibble: 25 x 2
## word n
## <chr> <int>
## 1 roomba 2286
## 2 clean 1204
## 3 vacuum 989
## 4 hair 900
## 5 cleaning 809
## 6 time 795
## 7 house 745
## 8 floors 657
## 9 day 578
## 10 floor 561
## # ... with 15 more rows
Review Word Counts + coord_flip()
# Bar chart mapped with word on x and n on y; coord_flip() then swaps the
# axes so the counts run horizontally and the words are listed vertically.
ggplot(word_counts_P) +
  geom_col(aes(x = word, y = n)) +
  coord_flip() +
  ggtitle("Review Word Counts")
Review Word Counts
# Same chart without coord_flip(): the counts are mapped straight to x and
# the words to y.
ggplot(word_counts_P) +
  geom_col(aes(x = n, y = word)) +
  ggtitle("Review Word Counts")
##Custom Stop Words
# Extra, dataset-specific stop words as a two-column tibble (`word`,
# `lexicon`) so they can be appended to stop_words further down.
custom_stop_words <- tibble(
  word = c("roomba", "2"),
  lexicon = "CUSTOM" # length-1 value, recycled to every row
)
custom_stop_words
## # A tibble: 2 x 2
## word lexicon
## <chr> <chr>
## 1 roomba CUSTOM
## 2 2 CUSTOM
Stop Word + Custom Stop Words
# Full stop-word list: the stock stop_words rows followed by the custom ones.
stop_words2 <- bind_rows(stop_words, custom_stop_words)
Tokens + Stopwords
# Tokenize once more, this time filtering with the extended stop-word list.
tidy_review_SW2 <- review_data %>%
  # Number the reviews before tokenizing so every token keeps its review id.
  mutate(id = row_number()) %>%
  # Keep only the columns needed downstream (Title is dropped).
  select(id, Date, Product, Stars, Review) %>%
  unnest_tokens(word, Review) %>%
  anti_join(stop_words2)
## Joining, by = "word"
tidy_review_SW2
## # A tibble: 76,175 x 5
## id Date Product Stars word
## <int> <chr> <chr> <dbl> <chr>
## 1 2 1/12/15 iRobot Roomba 650 for Pets 4 walk
## 2 2 1/12/15 iRobot Roomba 650 for Pets 4 rest
## 3 3 12/26/13 iRobot Roomba 650 for Pets 5 proof
## 4 3 12/26/13 iRobot Roomba 650 for Pets 5 house
## 5 3 12/26/13 iRobot Roomba 650 for Pets 5 awesome
## 6 3 12/26/13 iRobot Roomba 650 for Pets 5 pet
## 7 3 12/26/13 iRobot Roomba 650 for Pets 5 cleans
## 8 4 8/4/13 iRobot Roomba 650 for Pets 3 fascinating
## 9 4 8/4/13 iRobot Roomba 650 for Pets 3 albeit
## 10 4 8/4/13 iRobot Roomba 650 for Pets 3 expensive
## # ... with 76,165 more rows
Word Count + Filter + Arrange
# Word frequencies after the extended stop-word filter: most common first,
# limited to words seen more than 300 times.
word_counts_SW2 <- tidy_review_SW2 %>%
  count(word, sort = TRUE) %>%
  filter(n > 300)
word_counts_SW2
## # A tibble: 23 x 2
## word n
## <chr> <int>
## 1 clean 1204
## 2 vacuum 989
## 3 hair 900
## 4 cleaning 809
## 5 time 795
## 6 house 745
## 7 floors 657
## 8 day 578
## 9 floor 561
## 10 dust 543
## # ... with 13 more rows
Plot
# Horizontal bar chart of the filtered counts. `word` is a plain character
# column here, so the bars are not ordered by count.
ggplot(word_counts_SW2) +
  geom_col(aes(x = n, y = word)) +
  ggtitle("Review Word Counts")
Word Count + TOP10 + Factor Reorder
# Ten most frequent words (replaces the earlier n > 300 cut-off), plus a
# factor copy of `word` whose levels are ordered by count so that plots can
# draw the bars in frequency order.
word_counts_SW3 <- tidy_review_SW2 %>%
  count(word) %>%
  top_n(n = 10, wt = n) %>%
  mutate(
    word2 = fct_reorder(word, n) # factor with levels sorted by n
  )
word_counts_SW3
## # A tibble: 10 x 3
## word n word2
## <chr> <int> <fct>
## 1 clean 1204 clean
## 2 cleaning 809 cleaning
## 3 day 578 day
## 4 dust 543 dust
## 5 floor 561 floor
## 6 floors 657 floors
## 7 hair 900 hair
## 8 house 745 house
## 9 time 795 time
## 10 vacuum 989 vacuum
Plot
# Plot against the reordered factor so the bars follow the count order.
ggplot(word_counts_SW3) +
  geom_col(aes(x = n, y = word2)) +
  ggtitle("Review Word Counts")
##EXAMPLE
# Top 20 words per product, with a count-ordered factor for plotting.
# NOTE(review): this starts from tidy_review, which still contains stop
# words, so the result is dominated by words such as "a" and "and" --
# confirm that is intended for this example.
# NOTE(review): a word can appear once per product, so fct_reorder() ranks
# it on a summary of both counts rather than per product -- verify this is
# the ordering wanted in the faceted plot.
word_counts_EX <- tidy_review %>%
  count(word, Product) %>%
  group_by(Product) %>%
  top_n(n = 20, wt = n) %>%
  ungroup() %>%
  mutate(
    word2 = fct_reorder(word, n)
  )
word_counts_EX
## # A tibble: 40 x 4
## word Product n word2
## <chr> <chr> <int> <fct>
## 1 a iRobot Roomba 650 for Pets 1901 a
## 2 a iRobot Roomba 880 for Pets and Allergies 3983 a
## 3 and iRobot Roomba 650 for Pets 2165 and
## 4 and iRobot Roomba 880 for Pets and Allergies 4629 and
## 5 but iRobot Roomba 650 for Pets 529 but
## 6 but iRobot Roomba 880 for Pets and Allergies 1204 but
## 7 for iRobot Roomba 650 for Pets 583 for
## 8 for iRobot Roomba 880 for Pets and Allergies 1357 for
## 9 have iRobot Roomba 650 for Pets 811 have
## 10 have iRobot Roomba 880 for Pets and Allergies 1659 have
## # ... with 30 more rows
# Both products share one panel here; rows for the same word from the two
# products land on the same bar position.
ggplot(word_counts_EX) +
  geom_col(aes(x = n, y = word2)) +
  ggtitle("Review Word Counts")
# One panel per product, coloured by product. The legend is hidden because
# the facet strips already name each product.
ggplot(
  word_counts_EX,
  mapping = aes(x = word2, y = n, fill = Product)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ Product, scales = "free_y") +
  coord_flip() +
  ggtitle("Roomba Word Counts")