library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
library("syuzhet")
library("ggplot2")
library(R.utils)
library(tokenizers)
library(dplyr)
library(tidytext)
library(ggplot2)
library(tidyr)
It soon became apparent that the blog and Twitter files are quite large, so a random 10% sample of those files was analysed.
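Because sample() draws lines at random, fixing the seed first makes the 10% subsets reproducible across runs; the seed value of 1234 below is an arbitrary choice.
set.seed(1234)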
text_blog <- readLines("en_US.blogs.txt")
text_blog <- iconv(text_blog,"latin1","ASCII",sub = "")
factor <- 0.1
text_blog1 <- sample(text_blog,round(factor*length(text_blog)))
Here the data is converted into a corpus. Special characters such as "/", "@" and "|" are replaced with spaces, the text is converted to lower case, and numbers, common English stopwords, custom stopwords and punctuation are removed, followed by stripping extra white space. The final step in this section is text stemming, which reduces words to their root form.
TextDoc <- Corpus(VectorSource(text_blog1))
#Replacing "/", "@" and "|" with space
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
TextDoc <- tm_map(TextDoc, toSpace, "/")
TextDoc <- tm_map(TextDoc, toSpace, "@")
TextDoc <- tm_map(TextDoc, toSpace, "\\|")
# Convert the text to lower case
TextDoc <- tm_map(TextDoc, content_transformer(tolower))
# Remove numbers
TextDoc <- tm_map(TextDoc, removeNumbers)
# Remove common English stopwords
TextDoc <- tm_map(TextDoc, removeWords, stopwords("english"))
# Remove your own stopwords,
# specified as a character vector
TextDoc <- tm_map(TextDoc, removeWords, c("s", "company", "team"))
# Remove punctuation
TextDoc <- tm_map(TextDoc, removePunctuation)
# Eliminate extra white spaces
TextDoc <- tm_map(TextDoc, stripWhitespace)
# Text stemming - which reduces words to their root form
TextDoc <- tm_map(TextDoc, stemDocument)
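The same cleaning steps are repeated below for the news and Twitter corpora, so they could equally be collected into a small helper function along the following lines (the name clean_corpus is illustrative and is not used in the analysis above):
clean_corpus <- function(doc) {
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  doc <- tm_map(doc, toSpace, "/")
  doc <- tm_map(doc, toSpace, "@")
  doc <- tm_map(doc, toSpace, "\\|")
  doc <- tm_map(doc, content_transformer(tolower))
  doc <- tm_map(doc, removeNumbers)
  doc <- tm_map(doc, removeWords, stopwords("english"))
  doc <- tm_map(doc, removeWords, c("s", "company", "team"))
  doc <- tm_map(doc, removePunctuation)
  doc <- tm_map(doc, stripWhitespace)
  tm_map(doc, stemDocument)
}
# e.g. TextDoc <- clean_corpus(Corpus(VectorSource(text_blog1)))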
text_news <- readLines("en_US.news.txt")
text_news <- iconv(text_news,"latin1","ASCII",sub = "")
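# The news corpus is kept in full (no sampling is applied to it)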
text_news1 <- text_news
TextDoc_new <- Corpus(VectorSource(text_news1))
#Replacing "/", "@" and "|" with space
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
TextDoc_new <- tm_map(TextDoc_new, toSpace, "/")
TextDoc_new <- tm_map(TextDoc_new, toSpace, "@")
TextDoc_new <- tm_map(TextDoc_new, toSpace, "\\|")
# Convert the text to lower case
TextDoc_new <- tm_map(TextDoc_new, content_transformer(tolower))
# Remove numbers
TextDoc_new <- tm_map(TextDoc_new, removeNumbers)
# Remove common English stopwords
TextDoc_new <- tm_map(TextDoc_new, removeWords, stopwords("english"))
# Remove your own stopwords,
# specified as a character vector
TextDoc_new <- tm_map(TextDoc_new, removeWords, c("s", "company", "team"))
# Remove punctuation
TextDoc_new <- tm_map(TextDoc_new, removePunctuation)
# Eliminate extra white spaces
TextDoc_new <- tm_map(TextDoc_new, stripWhitespace)
# Text stemming - which reduces words to their root form
TextDoc_new <- tm_map(TextDoc_new, stemDocument)
text_twitter <- readLines(file.choose())
text_twitter <- iconv(text_twitter,"latin1","ASCII",sub = "")
factor <- 0.1
text_twitter1 <- sample(text_twitter,round(factor*length(text_twitter)))
# Load the data as a corpus
TextDoc_twitter <- Corpus(VectorSource(text_twitter1))
#Replacing "/", "@" and "|" with space
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
TextDoc_twitter <- tm_map(TextDoc_twitter, toSpace, "/")
TextDoc_twitter <- tm_map(TextDoc_twitter, toSpace, "@")
TextDoc_twitter <- tm_map(TextDoc_twitter, toSpace, "\\|")
# Convert the text to lower case
TextDoc_twitter <- tm_map(TextDoc_twitter, content_transformer(tolower))
# Remove numbers
TextDoc_twitter <- tm_map(TextDoc_twitter, removeNumbers)
# Remove common English stopwords
TextDoc_twitter <- tm_map(TextDoc_twitter, removeWords, stopwords("english"))
# Remove your own stopwords,
# specified as a character vector
TextDoc_twitter <- tm_map(TextDoc_twitter, removeWords, c("s", "company", "team"))
# Remove punctuation
TextDoc_twitter <- tm_map(TextDoc_twitter, removePunctuation)
# Eliminate extra white spaces
TextDoc_twitter <- tm_map(TextDoc_twitter, stripWhitespace)
# Text stemming - which reduces words to their root form
TextDoc_twitter <- tm_map(TextDoc_twitter, stemDocument)
Analogous preprocessing code is applied to the news and Twitter files, as shown above.
# Tokenization
The data is split into separate words and converted to term-document matrices.
text_blog1_tokens<- tokenize_words(text_blog1)
text_news1_tokens<- tokenize_words(text_news1)
text_twitter1_tokens<- tokenize_words(text_twitter1)
TextDoc_dtm <- TermDocumentMatrix(TextDoc)
TextDoc_dtm_news <- TermDocumentMatrix(TextDoc_new)
TextDoc_dtm_twit <- TermDocumentMatrix(TextDoc_twitter)
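As a quick sanity check on the term-document matrices, tm's findFreqTerms() lists the terms that occur at least a given number of times; the threshold of 200 used here is an arbitrary illustration.
findFreqTerms(TextDoc_dtm, lowfreq = 200)
findFreqTerms(TextDoc_dtm_news, lowfreq = 200)
findFreqTerms(TextDoc_dtm_twit, lowfreq = 200)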
# Data exploration
Now we can see the most frequent positive and negative words.
For the blog data:
TextDoc_dtm_td <- tidy(TextDoc_dtm)
TextDoc_dtm_sentiments <- TextDoc_dtm_td %>%
  inner_join(get_sentiments("bing"), by = c(term = "word"))
TextDoc_dtm_sentiments <- TextDoc_dtm_sentiments %>% arrange(desc(count))
TextDoc_dtm_sentiments %>%
  count(document, sentiment, wt = count) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)
## # A tibble: 57,978 x 4
## document negative positive sentiment
## <chr> <dbl> <dbl> <dbl>
## 1 39933 29 2 -27
## 2 72641 21 1 -20
## 3 21443 19 1 -18
## 4 26864 25 7 -18
## 5 44638 16 0 -16
## 6 11660 15 0 -15
## 7 45887 16 1 -15
## 8 5008 17 2 -15
## 9 56153 18 3 -15
## 10 1763 18 4 -14
## # ... with 57,968 more rows
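For an overall picture rather than a per-document one, the positive and negative term occurrences can also be totalled across the whole blog sample (output not shown here):
TextDoc_dtm_sentiments %>%
  count(sentiment, wt = count)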
TextDoc_dtm_sentiments %>%
  count(sentiment, term, wt = count) %>%
  filter(n >= 1000) %>%
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment")
wordcloud(words = TextDoc_dtm_sentiments$term, freq = TextDoc_dtm_sentiments$count, min.freq=4,
max.words = 500, random.order = FALSE, scale = c(3, 0.5), colors = rainbow(3))
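A variant of the word cloud that separates negative from positive terms is wordcloud's comparison.cloud(); the sketch below assumes the reshape2 package is available for building the term-by-sentiment matrix.
library(reshape2)  # assumed installed; used only for acast()
TextDoc_dtm_sentiments %>%
  count(term, sentiment, wt = count) %>%
  acast(term ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("red", "darkgreen"), max.words = 200)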
Bigram tokens are also generated from the blog terms:
TextDoc_dtm_td_bigram <- TextDoc_dtm_td %>% unnest_tokens(bigram, term, token = "ngrams", n = 2)
TextDoc_dtm_td_bigram_count <- TextDoc_dtm_td_bigram %>% count(bigram, sort = TRUE)
TextDoc_dtm_td_bigram_filtered <- TextDoc_dtm_td_bigram %>%
  filter(!is.na(bigram))
For the news data:
TextDoc_dtm_news_td <- tidy(TextDoc_dtm_news)
TextDoc_dtm_news_sentiments <- TextDoc_dtm_news_td %>%
  inner_join(get_sentiments("bing"), by = c(term = "word"))
head(TextDoc_dtm_news_sentiments)
## # A tibble: 6 x 4
## term document count sentiment
## <chr> <chr> <dbl> <chr>
## 1 die 2 1 negative
## 2 applaud 3 1 positive
## 3 hot 3 1 positive
## 4 loss 3 1 negative
## 5 fall 4 1 negative
## 6 lead 4 1 positive
TextDoc_dtm_news_sentiments <- TextDoc_dtm_news_sentiments %>% arrange(desc(count))
TextDoc_dtm_news_sentiments %>%
  count(document, sentiment, wt = count) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)
## # A tibble: 48,978 x 4
## document negative positive sentiment
## <chr> <dbl> <dbl> <dbl>
## 1 42426 10 0 -10
## 2 10185 9 0 -9
## 3 23329 9 0 -9
## 4 36634 9 0 -9
## 5 46812 13 4 -9
## 6 64725 9 0 -9
## 7 1016 9 1 -8
## 8 19105 10 2 -8
## 9 25548 8 0 -8
## 10 27353 8 0 -8
## # ... with 48,968 more rows
TextDoc_dtm_news_sentiments %>%
  count(sentiment, term, wt = count) %>%
  filter(n >= 1000) %>%
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment")
wordcloud(words = TextDoc_dtm_news_sentiments$term, freq = TextDoc_dtm_news_sentiments$count, min.freq=4,
max.words = 500, random.order = FALSE, scale = c(3, 0.5), colors = rainbow(3))
And for the Twitter data:
TextDoc_dtm_twit_td <- tidy(TextDoc_dtm_twit)
TextDoc_dtm_twit_sentiments <- TextDoc_dtm_twit_td %>%
  inner_join(get_sentiments("bing"), by = c(term = "word"))
head(TextDoc_dtm_twit_sentiments)
## # A tibble: 6 x 4
## term document count sentiment
## <chr> <chr> <dbl> <chr>
## 1 greatest 2 1 positive
## 2 love 3 1 positive
## 3 thank 3 1 positive
## 4 lose 5 1 negative
## 5 bad 6 1 negative
## 6 critic 7 1 negative
TextDoc_dtm_twit_sentiments <- TextDoc_dtm_twit_sentiments %>% arrange(desc(count))
TextDoc_dtm_twit_sentiments %>%
  count(document, sentiment, wt = count) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  arrange(sentiment)
## # A tibble: 132,596 x 4
## document negative positive sentiment
## <chr> <dbl> <dbl> <dbl>
## 1 8604 11 0 -11
## 2 13546 10 0 -10
## 3 76909 9 0 -9
## 4 192594 8 0 -8
## 5 222110 8 0 -8
## 6 231253 8 0 -8
## 7 110791 7 0 -7
## 8 124944 7 0 -7
## 9 157696 7 0 -7
## 10 169574 7 0 -7
## # ... with 132,586 more rows
TextDoc_dtm_twit_sentiments %>%
  count(sentiment, term, wt = count) %>%
  filter(n >= 1000) %>%
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment")
wordcloud(words = TextDoc_dtm_twit_sentiments$term, freq = TextDoc_dtm_twit_sentiments$count, min.freq=4,
max.words = 500, random.order = FALSE, scale = c(3, 0.5), colors = rainbow(3))
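Finally, the three sources could be compared side by side by combining the sentiment tables into one frame; the source labels below are illustrative.
all_sentiments <- bind_rows(
  TextDoc_dtm_sentiments %>% mutate(source = "blogs"),
  TextDoc_dtm_news_sentiments %>% mutate(source = "news"),
  TextDoc_dtm_twit_sentiments %>% mutate(source = "twitter")
)
all_sentiments %>%
  count(source, sentiment, wt = count) %>%
  ggplot(aes(source, n, fill = sentiment)) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Number of sentiment term occurrences")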