The code for this analysis is available online at https://github.com/luizfelipebrito/Natural-language-processing-and-Text-Mining-with-R
This dataset consists of reviews of fine foods from Amazon. It allows us to analyze which words are used most frequently in reviews. Furthermore, we can use the tools of text mining to approach the emotional content of the text programmatically and infer whether a review is positive or negative, or perhaps characterized by some other, more nuanced emotional content such as surprise or disgust.
Amazon Fine Food Reviews: the data span a period of more than 10 years and contain roughly 500,000 food reviews from Amazon. https://www.kaggle.com/snap/amazon-fine-food-reviews/downloads/amazon-fine-food-reviews.zip/2
rm(list = ls())  # clear the workspace
cat("\014")      # clear the console
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidytext)
library(stringr)
library(tidyr)
library(wordcloud)
## Loading required package: RColorBrewer
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(hunspell)
library(SnowballC)
library(xtable)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
setwd("D:\\Text_Mining")
raw_text <- read.csv("Reviews.csv", header = TRUE)
# Rename the columns of interest and keep only those three.
names(raw_text)[names(raw_text) == "Id"] <- "id_review"
names(raw_text)[names(raw_text) == "Summary"] <- "summary_review"
names(raw_text)[names(raw_text) == "Text"] <- "text_review"
raw_text <- raw_text %>% select(id_review, summary_review, text_review)
Sometimes the raw text contains structure and extra content, such as HTML markup, that we do not want to include in our analysis.
cleaned_text <- raw_text %>%
  filter(str_detect(text_review, "^[^>]+[A-Za-z\\d]") | text_review != "")  # drop reviews with no usable text
Every raw text dataset will require different data-cleaning steps, which often involve some trial and error and exploration of unusual cases in the dataset.
cleaned_text$text_review <- gsub("[_]", "", cleaned_text$text_review)     # remove underscores
cleaned_text$text_review <- gsub("<br />", "", cleaned_text$text_review)  # remove HTML line breaks
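A quick illustration of the second substitution on a made-up string (not from the dataset):
gsub("<br />", "", "Great coffee.<br /><br />Fast shipping.")
# expected: "Great coffee.Fast shipping."
Note that the surrounding sentences are joined directly; substituting a space rather than an empty string would keep the adjacent words from running together.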
A token is a meaningful unit of text, most often a word, that we are interested in using for further analysis, and tokenization is the process of splitting text into tokens.
text_df <- tibble(id_review = cleaned_text$id_review, text_review = cleaned_text$text_review)
text_df <- text_df %>% unnest_tokens(word, text_review)
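As a minimal check (a made-up sentence, not part of the pipeline), unnest_tokens produces one lower-cased word per row and drops punctuation:
tibble(id_review = 1L, text_review = "Great coffee, GREAT price!") %>%
  unnest_tokens(word, text_review)
# expected tokens: "great", "coffee", "great", "price"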
getStemLanguages() %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| x |
|---|
| danish |
| dutch |
| english |
| finnish |
| french |
| german |
| hungarian |
| italian |
| norwegian |
| porter |
| portuguese |
| romanian |
| russian |
| spanish |
| swedish |
| turkish |
We have split each row so that there is one token (word) in each row of the new data frame.
text_df$word <- wordStem(text_df$word, language = "english")
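wordStem() from SnowballC reduces each token to its stem, which is why truncated forms such as "tast", "coffe", and "tri" show up in the word counts below. A minimal check on a few made-up tokens:
wordStem(c("tasting", "tasted", "tastes"), language = "english")
# expected: "tast" "tast" "tast"
wordStem(c("coffee", "tried", "tries"), language = "english")
# expected: "coffe" "tri" "tri"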
Punctuation has been stripped. The words were converted to lowercase, which makes them easier to compare or combine with other datasets.
head(table(text_df$word)) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| Var1 | Freq |
|---|---|
| 0 | 154 |
| 0,45 | 1 |
| 0.0006mg | 1 |
| 0.035 | 1 |
| 0.05 | 3 |
| 0.09 | 1 |
data(stop_words)
text_df <- text_df %>%
anti_join(stop_words, "word")
xtable(head(text_df %>%
count(word, sort = TRUE))) %>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| word | n |
|---|---|
| tast | 15974 |
| flavor | 13385 |
| product | 11914 |
| love | 11598 |
| coffe | 11554 |
| tri | 10440 |
Plot_01_word_count
text_df %>%
count(word, sort = TRUE) %>%
filter(n > 3000) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
Sentiment_Analysis <- text_df %>%
inner_join(get_sentiments("bing"), "word") %>%
count(id_review, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
One way to analyze the sentiment of a text is to consider the text as a combination of its individual words, and the sentiment content of the whole text as the sum of the sentiment content of the individual words.
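For example, applying the same pipeline to a single made-up review (assuming "bad" and "great" are the only words in it that the Bing lexicon matches):
toy_review <- tibble(id_review = 999L, word = c("bad", "coffee", "but", "great", "price"))
toy_review %>%
  inner_join(get_sentiments("bing"), "word") %>%
  count(id_review, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
# expected under that assumption: negative = 1, positive = 1, sentiment = 0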
head(Sentiment_Analysis)%>%
kable() %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| id_review | negative | positive | sentiment |
|---|---|---|---|
| 1 | 3 | 0 | -3 |
| 2 | 1 | 0 | -1 |
| 3 | 0 | 2 | 2 |
| 5 | 0 | 1 | 1 |
| 6 | 2 | 3 | 1 |
| 7 | 1 | 3 | 2 |
Sentiment_Analysis_Word_Count <- text_df %>%
inner_join(get_sentiments("bing"), "word") %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
Plot_02_word_count
Sentiment_Analysis_Word_Count %>%
group_by(sentiment) %>%
top_n(12, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to Sentiment", x = NULL) +
coord_flip()
Sentiment_Analysis_Word_Contribution <- text_df %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(word) %>%
  # AFINN assigns each word an integer score from -5 to +5;
  # newer releases of the lexicon name this column "value" instead of "score".
  summarize(occurrences = n(), contribution = sum(score))
Plot_03_word_contribution
Sentiment_Analysis_Word_Contribution %>%
top_n(50, abs(contribution)) %>%
mutate(word = reorder(word, contribution)) %>%
ggplot(aes(word, contribution, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
coord_flip()
Plot_04_word_cloud
text_df %>%
anti_join(stop_words, "word") %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
Plot_05_word_cloud
text_df %>%
inner_join(get_sentiments("bing"), "word") %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)
Term frequency (tf) is one measure of how important a word may be: how frequently the word occurs in a document. Inverse document frequency (idf) decreases the weight of commonly used words and increases the weight of words that are rarely used in a collection of documents. Calculating tf-idf attempts to find the words that are important in a text, but not too common.
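In the code below the whole review corpus is treated as a single document, so idf is zero for every word and the plot effectively ranks words by raw term frequency. For reference, a minimal sketch (two made-up documents, not part of the dataset) of how bind_tf_idf behaves when there is more than one document:
tibble(document = c("A", "A", "A", "B", "B", "B"),
       word = c("coffee", "coffee", "great", "tea", "tea", "great")) %>%
  count(document, word, sort = TRUE) %>%
  bind_tf_idf(word, document, n)
# "great" appears in both documents, so its idf and tf-idf are 0;
# "coffee" and "tea" appear in only one document each and get a positive tf-idf.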
term_frequency_review <- text_df %>% count(word, sort = TRUE)
term_frequency_review$total_words <- as.numeric(term_frequency_review %>% summarize(total = sum(n)))  # total number of tokens
term_frequency_review$document <- as.character("Review")  # treat the whole corpus as one document
term_frequency_review <- term_frequency_review %>%
bind_tf_idf(word, document, n)
Plot_06_tf_idf
term_frequency_review %>%
arrange(desc(tf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(document) %>%
top_n(15, tf) %>%
ungroup() %>%
ggplot(aes(word, tf, fill = document)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "Term Frequency (tf)") +
facet_wrap(~document, ncol = 2, scales = "free") +
coord_flip()