Text Mining Social Media Data

Making the dataset

# Build a per-word frequency table (`numbered`) from the social media messages.

# Attach packages before their first use: dplyr supplies %>%, select(),
# anti_join() and count(); tidytext supplies unnest_tokens() and stop_words.
library(dplyr)
library(tidytext)

# Read the social media message data into a data frame.
# stringsAsFactors = FALSE keeps text columns as character on R < 4.0 as well,
# where read.csv() would otherwise convert them to factors.
data <- read.csv("messages1.csv", stringsAsFactors = FALSE)
# head(data)

# Keep only the CONTENT column of the dataset as its own data frame.
select_column_content <- data %>%
  select(CONTENT)
# select_column_content

# Use unnest_tokens() to put one word on each row.
# The new column "word" holds the unnested words taken from CONTENT.
tidy_d <- select_column_content %>%
  unnest_tokens(word, CONTENT)
# tidy_d

# Remove stop words. The join key is spelled out so the join does not depend
# on (or print a message about) whichever columns the two tables share.
without_stop_words <- tidy_d %>%
  anti_join(stop_words, by = "word")

# Use dplyr's count() to tally how often each remaining word occurs in the
# dataset; the tally is stored in a column named `n`.
numbered <- without_stop_words %>%
  count(word)

Making a frequency bar chart and word clouds

# The words are now stored in a tidy data frame (`numbered`: one row per word,
# with its count in `n`), so it can be piped directly into ggplot2 to
# visualise the most common words.

# ggplot2 supplies the plotting functions and wordcloud supplies wordcloud();
# both are attached here so the calls below resolve.
library(ggplot2)
library(wordcloud)

# Bar chart of the words that appear more than 100 times, ordered by count.
# geom_col() is the direct equivalent of geom_bar(stat = "identity"), and
# labs() replaces the default axis title "reorder(word, n)".
numbered %>%
  filter(n > 100) %>%
  ggplot(aes(reorder(word, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Word", y = "Occurrences")

# Same chart restricted to the words that appear more than 200 times.
numbered %>%
  filter(n > 200) %>%
  ggplot(aes(reorder(word, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Word", y = "Occurrences")

# Word cloud of the top words appearing in the dataset (at most 200 words).
wordcloud(
  words = numbered$word,
  freq = numbered$n,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35
)

Using sentiment analysis

# Sentiment Analysis
#
# Joins the word counts in `numbered` against the NRC "joy" words and the Bing
# lexicon, then plots the results.

# Attach what this section calls: get_sentiments() (tidytext), ggplot2,
# wordcloud() / comparison.cloud() (wordcloud) and acast() (reshape2).
# reshape2 was previously only mentioned in a commented-out library() call,
# so acast() could not be found.
# install.packages("reshape2")
library(tidytext)
library(ggplot2)
library(wordcloud)
library(reshape2)

# Words that the NRC lexicon tags with the "joy" sentiment.
# NOTE(review): get_sentiments("nrc") is documented to need the textdata
# package and a one-time lexicon download - confirm it is set up.
nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

# Keep only the counted words that are also joy words.
innerjtables <- merge(numbered, nrc_joy, by = "word")
# innerjtables

# Bar chart of the joy words that appear more than 50 times.
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
innerjtables %>%
  filter(n > 50) %>%
  ggplot(aes(reorder(word, n), n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Word", y = "Occurrences")

# Word cloud of joy words (the stray trailing comma that passed an empty
# argument to wordcloud() has been removed).
wordcloud(
  words = innerjtables$word,
  freq = innerjtables$n,
  min.freq = 1,
  max.words = 250,
  random.order = FALSE,
  rot.per = 0.35
)

# Use the Bing lexicon to find the sentiments of words in the dataset.
bingsent <- get_sentiments("bing")
mergedtobing <- merge(bingsent, numbered, by = "word")

# Disabled drafts, kept for reference (not run). Note that the count column
# in `mergedtobing` is named `n`; there is no `freq` column.
# sentiment_breakdown <- count(mergedtobing$sentiment)
#
# analyze_sentiment <- mergedtobing %>%
#   group_by(word) %>%
#   spread(sentiment, freq, fill = 0) %>%
#   mutate(sentimentchange = positive - negative)

# Sentiment comparison word cloud: acast() reshapes the merged table into a
# word-by-sentiment matrix of counts (0 where a word lacks that sentiment),
# which comparison.cloud() then draws with one colour per sentiment column.
mergedtobing %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(
    colors = c("red", "green4"),
    max.words = 100
  )

Text Mining Social Media Data

Amy Shah