You can also embed plots, for example:
## # A tibble: 2 x 2
## Product starts_mean
## <chr> <dbl>
## 1 iRobot Roomba 650 for Pets 4.49
## 2 iRobot Roomba 880 for Pets and Allergies 4.42
Text Data is categorical…
## # A tibble: 2 x 2
## Product number_rows
## <chr> <int>
## 1 iRobot Roomba 880 for Pets and Allergies 1200
## 2 iRobot Roomba 650 for Pets 633
Some natural NLP vocabulary:
unnest_tokens Is like a gather() function but for all words in the text column, remove punctuation,each word is lowercase and white space has been removed too.
tidy_review <- review_data %>%
unnest_tokens(word,Review)
datatable(data =tidy_review %>%
count(word) %>%
arrange(desc(n)) ,style = "Bootstrap")## # A tibble: 6 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
tidy_review2 <- review_data %>%
mutate(id = row_number()) %>%
unnest_tokens(word, Review) %>%
anti_join(stop_words)## Joining, by = "word"
tidy_review2 %>%
count(word) %>%
arrange(desc(n)) %>%
filter(n > 300) %>%
ggplot(
aes(x = word, y = n)
) +
geom_col() +
theme_minimal() +
coord_flip() +
ggtitle("Review Word Counts")custom_stopwords <- tribble(~word, ~lexicon,
"roomba", "CUSTOM",
"2", "CUSTOM")
stop_words_c <- stop_words %>%
bind_rows(custom_stopwords)
tidy_review3 <- review_data %>%
mutate(id = row_number()) %>%
unnest_tokens(word, Review) %>%
anti_join(stop_words_c)## Joining, by = "word"
tidy_review3 %>%
count(word, Product) %>%
group_by(Product) %>%
top_n(10,n) %>%
ungroup() %>%
mutate(word = fct_reorder(word,n)) %>%
ggplot(
aes(x = word, y = n, fill = Product)
) +
geom_col(show.legend = F) +
facet_wrap(~ Product, scales = "free_y") +
theme_minimal() +
coord_flip() +
ggtitle("Review Word Counts")tidy_words <- tidy_review3 %>% count(word)
wordcloud(
words = tidy_words$word,
freq = tidy_words$n,
max.words = 30,
random.order = FALSE,
colors = brewer.pal(6,"Dark2")
)## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
# Afinn dictionary: assigns scores to words, positive scores, positive words and negative scores for negative words
get_sentiments("afinn") %>%
summarize(
min = min(value),
max = max(value)
)## # A tibble: 1 x 2
## min max
## <dbl> <dbl>
## 1 -5 5
#Loughran dictionary:
sentiment_counts <- get_sentiments("loughran") %>%
count(sentiment) %>%
mutate(sentiment2 = fct_reorder(sentiment, n))
ggplot(sentiment_counts, aes(x = sentiment2, y = n)) +
geom_col() +
coord_flip() +
labs(
title = "Sentiment Counts in Loughran",
x = "Counts",
y = "Sentiment"
)# nrc dictionary
# Count the number of words associated with each sentiment in nrcs
sentiment_counts2 <- get_sentiments("nrc") %>%
count(sentiment) %>%
mutate(sentiment2 = fct_reorder(sentiment, n))
ggplot(sentiment_counts2, aes(x = sentiment2, y = n)) +
geom_col() +
coord_flip() +
labs(
title = "Sentiment Counts in nrc",
x = "Counts",
y = "Sentiment"
)tidy_review_sentiment <- tidy_review3 %>%
inner_join(get_sentiments("loughran")) %>%
count(word, sentiment) %>%
filter(sentiment %in% c("positive", "negative", "uncertainty")) %>%
group_by(sentiment) %>%
top_n(10,n) %>%
ungroup() %>%
mutate(word = fct_reorder(word, n))
ggplot(tidy_review_sentiment,
aes(x = word, y = n , fill = sentiment)) +
geom_col() +
facet_wrap(~sentiment, scales = "free")+
coord_flip() ## Improving sentiment analysis
sentiment_stars <- tidy_review %>%
inner_join(get_sentiments("bing")) %>%
count(Stars,sentiment) %>%
spread(sentiment,n) %>%
mutate(overall_sentiment = positive - negative,
Stars = fct_reorder(as.character(Stars), overall_sentiment))## Joining, by = "word"
ggplot(sentiment_stars,
aes(x = Stars, y = overall_sentiment, fill = as.factor(Stars))) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(title = "Overall sentiment stars")Searches for patterns of words ocurring together within and across a collection documents, also known as corpus.
dtm_matrix<- tidy_review %>%
mutate(id = row_number()) %>%
inner_join(get_sentiments("bing")) %>%
count(word, id) %>%
sample_n(size = 500) %>%
cast_dtm(id, word, n) %>% as.matrix()## Joining, by = "word"
## Formal class 'LDA_Gibbs' [package "topicmodels"] with 16 slots
## ..@ seedwords : NULL
## ..@ z : int [1:500] 1 1 2 2 1 1 1 1 1 1 ...
## ..@ alpha : num 25
## ..@ call : language LDA(x = dtm_matrix, k = 2, method = "Gibbs", control = list(seed = 42))
## ..@ Dim : int [1:2] 500 211
## ..@ control :Formal class 'LDA_Gibbscontrol' [package "topicmodels"] with 14 slots
## ..@ k : int 2
## ..@ terms : chr [1:211] "thank" "enthusiasm" "hung" "well" ...
## ..@ documents : chr [1:500] "222819" "93079" "26657" "51889" ...
## ..@ beta : num [1:2, 1:211] -4.89 -7.87 -5.54 -7.87 -7.94 ...
## ..@ gamma : num [1:500, 1:2] 0.51 0.51 0.49 0.49 0.51 ...
## ..@ wordassignments:List of 5
## .. ..$ i : int [1:500] 1 2 3 4 5 6 7 8 9 10 ...
## .. ..$ j : int [1:500] 1 2 3 4 5 6 7 6 8 9 ...
## .. ..$ v : num [1:500] 1 1 2 2 1 1 1 1 1 2 ...
## .. ..$ nrow: int 500
## .. ..$ ncol: int 211
## .. ..- attr(*, "class")= chr "simple_triplet_matrix"
## ..@ loglikelihood : num -2492
## ..@ iter : int 2000
## ..@ logLiks : num(0)
## ..@ n : int 500
lda_topics <- lda_out %>%
tidy(matrix = "beta") %>%
arrange(desc(beta)) %>%
group_by(topic) %>%
top_n(15,beta) %>%
ungroup() %>%
mutate(term = fct_reorder(term,beta))
ggplot(lda_topics,
aes(x = term,
y =beta,
fill = as.factor(topic))) +
geom_col() +
facet_wrap(~topic, scales = "free") +
coord_flip()