library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
library("syuzhet")
library("ggplot2")
library(R.utils)
library(tokenizers)
library(dplyr)
library(tidytext)
library(ggplot2)
library(tidyr)
It soon became apparent that the blog and Twitter files are quite large, so a random 10% sample of those files was analysed.
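Because sample() draws lines at random, fixing the seed first makes the 10% subsets reproducible across runs; the seed value of 1234 below is an arbitrary choice.
set.seed(1234)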
text_blog <- readLines("en_US.blogs.txt")
text_blog <- iconv(text_blog,"latin1","ASCII",sub = "")
factor <- 0.1
text_blog1 <- sample(text_blog,round(factor*length(text_blog)))
Here the data is converted into a corpus. Special characters such as "/", "@" and "|" are replaced with spaces, the text is converted to lower case, and numbers, common English stopwords, custom stopwords and punctuation are removed, followed by stripping extra white space. The final step in this section is text stemming, which reduces words to their root form.
TextDoc <- Corpus(VectorSource(text_blog1))
#Replacing "/", "@" and "|" with space
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
TextDoc <- tm_map(TextDoc, toSpace, "/")
TextDoc <- tm_map(TextDoc, toSpace, "@")
TextDoc <- tm_map(TextDoc, toSpace, "\\|")
# Convert the text to lower case
TextDoc <- tm_map(TextDoc, content_transformer(tolower))
# Remove numbers
TextDoc <- tm_map(TextDoc, removeNumbers)
# Remove common English stopwords
TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))
# Remove your own stopwords,
# specified as a character vector
TextDoc <- tm_map(TextDoc, removeWords, c("s", "company", "team"))
# Remove punctuation
TextDoc <- tm_map(TextDoc, removePunctuation)
# Eliminate extra white spaces
TextDoc <- tm_map(TextDoc, stripWhitespace)
# Text stemming - which reduces words to their root form
TextDoc <- tm_map(TextDoc, stemDocument)
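The same cleaning steps are repeated below for the news and Twitter corpora, so they could equally be collected into a small helper function along the following lines (the name clean_corpus is illustrative and is not used in the analysis above):
clean_corpus <- function(doc) {
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  doc <- tm_map(doc, toSpace, "/")
  doc <- tm_map(doc, toSpace, "@")
  doc <- tm_map(doc, toSpace, "\\|")
  doc <- tm_map(doc, content_transformer(tolower))
  doc <- tm_map(doc, removeNumbers)
  doc <- tm_map(doc, removeWords, stopwords("english"))
  doc <- tm_map(doc, removeWords, c("s", "company", "team"))
  doc <- tm_map(doc, removePunctuation)
  doc <- tm_map(doc, stripWhitespace)
  tm_map(doc, stemDocument)
}
# e.g. TextDoc <- clean_corpus(Corpus(VectorSource(text_blog1)))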
text_news <- readLines("en_US.news.txt")
text_news <- iconv(text_news,"latin1","ASCII",sub = "")
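# The news corpus is kept in full (no sampling is applied to it)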
text_news1 <- text_news
TextDoc_new <- Corpus(VectorSource(text_news1))
#Replacing "/", "@" and "|" with space
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
TextDoc_new <- tm_map(TextDoc_new, toSpace, "/")
TextDoc_new <- tm_map(TextDoc_new, toSpace, "@")
TextDoc_new <- tm_map(TextDoc_new, toSpace, "\\|")
# Convert the text to lower case
TextDoc_new <- tm_map(TextDoc_new, content_transformer(tolower))
# Remove numbers
TextDoc_new <- tm_map(TextDoc_new, removeNumbers)
# Remove common English stopwords
TextDoc_new <- tm_map(TextDoc_new, removeWords, stopwords("english"))
# Remove your own stopwords,
# specified as a character vector
TextDoc_new <- tm_map(TextDoc_new, removeWords, c("s", "company", "team"))
# Remove punctuation
TextDoc_new <- tm_map(TextDoc_new, removePunctuation)
# Eliminate extra white spaces
TextDoc_new <- tm_map(TextDoc_new, stripWhitespace)
# Text stemming - which reduces words to their root form
TextDoc_new <- tm_map(TextDoc_new, stemDocument)
text_twitter <- readLines(file.choose())
text_twitter <- iconv(text_twitter,"latin1","ASCII",sub = "")
factor <- 0.1
text_twitter1 <- sample(text_twitter,round(factor*length(text_twitter)))
# Load the data as a corpus
TextDoc_twitter <- Corpus(VectorSource(text_twitter1))
#Replacing "/", "@" and "|" with space
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
TextDoc_twitter <- tm_map(TextDoc_twitter, toSpace, "/")
TextDoc_twitter <- tm_map(TextDoc_twitter, toSpace, "@")
TextDoc_twitter <- tm_map(TextDoc_twitter, toSpace, "\\|")
# Convert the text to lower case
TextDoc_twitter <- tm_map(TextDoc_twitter, content_transformer(tolower))
# Remove numbers
TextDoc_twitter <- tm_map(TextDoc_twitter, removeNumbers)
# Remove common English stopwords
TextDoc_twitter <- tm_map(TextDoc_twitter, removeWords, stopwords("english"))
# Remove your own stopwords,
# specified as a character vector
TextDoc_twitter <- tm_map(TextDoc_twitter, removeWords, c("s", "company", "team"))
# Remove punctuation
TextDoc_twitter <- tm_map(TextDoc_twitter, removePunctuation)
# Eliminate extra white spaces
TextDoc_twitter <- tm_map(TextDoc_twitter, stripWhitespace)
# Text stemming - which reduces words to their root form
TextDoc_twitter <- tm_map(TextDoc_twitter, stemDocument)
Analogous preprocessing code is applied to the news and Twitter files, as shown above.
# Tokenization
The data is split into separate words and converted to term-document matrices.
text_blog1_tokens<- tokenize_words(text_blog1)
text_news1_tokens<- tokenize_words(text_news1)
text_twitter1_tokens<- tokenize_words(text_twitter1)
TextDoc_dtm <- TermDocumentMatrix(TextDoc)
TextDoc_dtm_news <- TermDocumentMatrix(TextDoc_new)
TextDoc_dtm_twit <- TermDocumentMatrix(TextDoc_twitter)
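As a quick sanity check on the term-document matrices, tm's findFreqTerms() lists the terms that occur at least a given number of times; the threshold of 200 used here is an arbitrary illustration.
findFreqTerms(TextDoc_dtm, lowfreq = 200)
findFreqTerms(TextDoc_dtm_news, lowfreq = 200)
findFreqTerms(TextDoc_dtm_twit, lowfreq = 200)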
# Data exploration
Now we can see the most frequent positive and negative words.
For the blog data:
TextDoc_dtm_td <- tidy(TextDoc_dtm)
TextDoc_dtm_sentiments <- TextDoc_dtm_td %>%
  inner_join(get_sentiments("bing"), by = c(term = "word"))
TextDoc_dtm_sentiments <- TextDoc_dtm_sentiments %>% arrange(desc(count))
TextDoc_dtm_sentiments %>%
  count(document, sentiment, wt = count) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)
## # A tibble: 57,978 x 4
## document negative positive sentiment
## <chr> <dbl> <dbl> <dbl>
## 1 39933 29 2 -27
## 2 72641 21 1 -20
## 3 21443 19 1 -18
## 4 26864 25 7 -18
## 5 44638 16 0 -16
## 6 11660 15 0 -15
## 7 45887 16 1 -15
## 8 5008 17 2 -15
## 9 56153 18 3 -15
## 10 1763 18 4 -14
## # ... with 57,968 more rows
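For an overall picture rather than a per-document one, the positive and negative term occurrences can also be totalled across the whole blog sample (output not shown here):
TextDoc_dtm_sentiments %>%
  count(sentiment, wt = count)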
TextDoc_dtm_sentiments %>%
  count(sentiment, term, wt = count) %>%
  filter(n >= 1000) %>%
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment")
wordcloud(words = TextDoc_dtm_sentiments$term, freq = TextDoc_dtm_sentiments$count, min.freq=4,
max.words = 500, random.order = FALSE, scale = c(3, 0.5), colors = rainbow(3))
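A variant of the word cloud that separates negative from positive terms is wordcloud's comparison.cloud(); the sketch below assumes the reshape2 package is available for building the term-by-sentiment matrix.
library(reshape2)  # assumed installed; used only for acast()
TextDoc_dtm_sentiments %>%
  count(term, sentiment, wt = count) %>%
  acast(term ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("red", "darkgreen"), max.words = 200)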
Bigram tokens are also generated from the blog terms:
TextDoc_dtm_td_bigram <- TextDoc_dtm_td %>% unnest_tokens(bigram, term, token = "ngrams", n = 2)
TextDoc_dtm_td_bigram_count <- TextDoc_dtm_td_bigram %>% count(bigram, sort = TRUE)
TextDoc_dtm_td_bigram_filtered <- TextDoc_dtm_td_bigram %>%
  filter(!is.na(bigram))
For the news data:
TextDoc_dtm_news_td <- tidy(TextDoc_dtm_news)
TextDoc_dtm_news_sentiments <- TextDoc_dtm_news_td %>%
  inner_join(get_sentiments("bing"), by = c(term = "word"))
head(TextDoc_dtm_news_sentiments)
## # A tibble: 6 x 4
## term document count sentiment
## <chr> <chr> <dbl> <chr>
## 1 die 2 1 negative
## 2 applaud 3 1 positive
## 3 hot 3 1 positive
## 4 loss 3 1 negative
## 5 fall 4 1 negative
## 6 lead 4 1 positive
TextDoc_dtm_news_sentiments <- TextDoc_dtm_news_sentiments %>% arrange(desc(count))
TextDoc_dtm_news_sentiments %>%
  count(document, sentiment, wt = count) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)
## # A tibble: 48,978 x 4
## document negative positive sentiment
## <chr> <dbl> <dbl> <dbl>
## 1 42426 10 0 -10
## 2 10185 9 0 -9
## 3 23329 9 0 -9
## 4 36634 9 0 -9
## 5 46812 13 4 -9
## 6 64725 9 0 -9
## 7 1016 9 1 -8
## 8 19105 10 2 -8
## 9 25548 8 0 -8
## 10 27353 8 0 -8
## # ... with 48,968 more rows
TextDoc_dtm_news_sentiments %>%
  count(sentiment, term, wt = count) %>%
  filter(n >= 1000) %>%
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment")
wordcloud(words = TextDoc_dtm_news_sentiments$term, freq = TextDoc_dtm_news_sentiments$count, min.freq=4,
max.words = 500, random.order = FALSE, scale = c(3, 0.5), colors = rainbow(3))
And for the Twitter data:
TextDoc_dtm_twit_td <- tidy(TextDoc_dtm_twit)
TextDoc_dtm_twit_sentiments <- TextDoc_dtm_twit_td %>%
  inner_join(get_sentiments("bing"), by = c(term = "word"))
head(TextDoc_dtm_twit_sentiments)
## # A tibble: 6 x 4
## term document count sentiment
## <chr> <chr> <dbl> <chr>
## 1 greatest 2 1 positive
## 2 love 3 1 positive
## 3 thank 3 1 positive
## 4 lose 5 1 negative
## 5 bad 6 1 negative
## 6 critic 7 1 negative
TextDoc_dtm_twit_sentiments <- TextDoc_dtm_twit_sentiments %>% arrange(desc(count))
TextDoc_dtm_twit_sentiments %>%
  count(document, sentiment, wt = count) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)
## # A tibble: 132,596 x 4
## document negative positive sentiment
## <chr> <dbl> <dbl> <dbl>
## 1 8604 11 0 -11
## 2 13546 10 0 -10
## 3 76909 9 0 -9
## 4 192594 8 0 -8
## 5 222110 8 0 -8
## 6 231253 8 0 -8
## 7 110791 7 0 -7
## 8 124944 7 0 -7
## 9 157696 7 0 -7
## 10 169574 7 0 -7
## # ... with 132,586 more rows
TextDoc_dtm_twit_sentiments %>%
  count(sentiment, term, wt = count) %>%
  filter(n >= 1000) %>%
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment")
wordcloud(words = TextDoc_dtm_twit_sentiments$term, freq = TextDoc_dtm_twit_sentiments$count, min.freq=4,
max.words = 500, random.order = FALSE, scale = c(3, 0.5), colors = rainbow(3))
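Finally, the three sources could be compared side by side by combining the sentiment tables into one frame; the source labels below are illustrative.
all_sentiments <- bind_rows(
  TextDoc_dtm_sentiments %>% mutate(source = "blogs"),
  TextDoc_dtm_news_sentiments %>% mutate(source = "news"),
  TextDoc_dtm_twit_sentiments %>% mutate(source = "twitter")
)
all_sentiments %>%
  count(source, sentiment, wt = count) %>%
  ggplot(aes(source, n, fill = sentiment)) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Number of sentiment term occurrences")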