Description:

An analysis of tweets under the #metoo hashtag, collected between December 1 and December 31, 2017.

#install.packages("twitteR")
#install.packages("ROAuth")
#install.packages("httr")
library(twitteR)
library(ROAuth)
library(httr)
library(plyr)
library(ggplot2)
library(ggthemes)
library(dplyr)
library(magrittr)
library(tidytext)
library(wordcloud)
library(tidyr)
library(reshape2)
library(devtools)
library(widyr)
library(igraph)
library(ggraph)
library(stringr)
library(readr)

Scrape Twitter for Tweets with #metoo hashtag

Define API Keys & Authenticate through Twitter app
api_key <- "*******"
api_secret <- "*******"
access_token <- "*******"
access_token_secret <- "*******"

setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)
Download tweets containing the #metoo hashtag posted in December 2017. Save as .csv.
metoo_tweets <- twListToDF(searchTwitter("#metoo", 
                            n=20000, 
                            lang="en",
                            since="2017-12-01",
                            until="2017-12-31"))

#                            subset(select=c("id","text","created",
#                                            "screenName","retweetCount",
#                                            "isRetweet","retweeted"))



#Save metoo_tweets to .csv
#write.csv(metoo_tweets,"/Users/Brett/Desktop/metoo/metoo_12_23_17.csv")
Load Data
metoo_tweets<-read_csv("/Users/Brett/Library/Mobile Documents/com~apple~CloudDocs/All Files/Education/QC Teaching/Tweet Archive/metoo/metoo_tweets_dec2017.csv")
#Take a random sample of the 300K tweets in this dataset. 
metoo_tweets<-sample_n(metoo_tweets, 50000)
Remove Emoji Characters
metoo_tweets$text <- sapply(metoo_tweets$text, function(row) iconv(row, "latin1", "ASCII", sub=""))
Separate each individual word into a row of its own.
        metootext <- metoo_tweets %>%
                     unnest_tokens(word, text)
Create a list of custom stop words.
data(stop_words)
custom_stop_words <- bind_rows(
                    data_frame(word = c("https","rt","t.co"), 
                              lexicon = c("custom")), 
                              stop_words)
Remove stop words from twitter data.
metootext <- metootext %>%
             anti_join(custom_stop_words, by="word")
How many words are we working with now?
[1] 475470
Plot Most Frequently Mentioned Words
#Plot most frequent words
detach("package:plyr", unload=TRUE)
‘plyr’ namespace cannot be unloaded:
  namespace ‘plyr’ is imported by ‘ggraph’, ‘ggplot2’, ‘scales’, ‘broom’, ‘reshape2’ so cannot be unloaded
library(dplyr)
metootext %>%
          filter(isRetweet == FALSE)%>%
          count(word, sort = TRUE) %>%
          filter(n > 500) %>%
          mutate(word = reorder(word, n)) %>%
          ggplot(aes(word, n)) +
                  geom_col(fill="lightblue") +
                        xlab("Words")+
                        ylab("# of Mentions | April 4th & 5th") +
                        coord_flip()+
                  theme_light()+
                  theme(text = element_text(size=10))+
                  geom_text(aes(x=word, y=n, label = n),
                      check_overlap = FALSE, size=3, hjust=1)

Wordcloud of Most Frequently Mentioned Words
metootextcloud <- metootext %>%
          filter(isRetweet == FALSE)%>%
          count(word, sort = TRUE) %>%
          filter(n > 40) %>%
          mutate(word = reorder(word, n))
          wordcloud(words = metootextcloud$word, freq = metootextcloud$n, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

Sentiment Analysis

Evaluate Sentiment Using NRC Emotion Lexicon for word-sentiment associations.

Plot most common sentiments
Note: words are counted once for every sentiment they are associated with
metootext %>%
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment, sort = TRUE) %>%
  ungroup()%>%
  group_by(sentiment) %>%
  ungroup() %>%
  mutate(sentiment = reorder(sentiment, n))%>%
  ggplot(aes(sentiment, n)) +
      geom_col(fill = "lightblue", show.legend = FALSE) +
          labs(y = "Number of Occurences",x = NULL) +
  coord_flip()+
      geom_text(aes(x=sentiment, y=n, label = n),
                    check_overlap = FALSE, 
                    size=3, 
                    angle=0, 
                    hjust=1.5)+
      theme_light()+    
      theme(text = element_text(size=14))

Plot most common words, by sentiment
metootext %>%
  inner_join(get_sentiments("nrc")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()%>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
      geom_col(show.legend = FALSE) +
          facet_wrap(~sentiment, scales = "free_y", nrow=4) +
          labs(y = "Number of Occurences",
          x = NULL) +
      coord_flip()+
      theme_light()+
      geom_text(aes(x=word, y=n, label = n),
                    check_overlap = FALSE, size=7, angle=0, hjust=.75)+
                    theme(text = element_text(size=24))

In what context are people using the word “powerful”?

metoo_tweets%>%
  filter(str_detect(text, "powerful"))%>%
  subset(select=c("text"))%>%
  print(row.names=FALSE)

Positive-Negative Sentiment Analysis

Using Bing Lexicon for positive-negative word classifications.

Total Number of Positive / Negative Words

metootext %>%
  inner_join(get_sentiments("bing")) %>%
  count(sentiment, sort = TRUE) %>%
  ungroup()%>%
  group_by(sentiment) %>%
  ungroup() %>%
  ggplot(aes(sentiment, n, fill = sentiment)) +
      geom_bar(show.legend = FALSE, stat="identity") +
          labs(y = "Number of Occurences",x = NULL) +
      geom_text(aes(x=sentiment, y=n, label = n),
                    check_overlap = FALSE, 
                    size=3, 
                    angle=0, 
                    vjust=1.5)+
      theme_light()+    
      theme(text = element_text(size=8),
              axis.text.y = element_blank())

Most frequently mentioned Positive / Negative Words

metootext %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()%>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
      geom_col(show.legend = FALSE) +
          facet_wrap(~sentiment, scales = "free_y") +
          labs(y = "Number of Occurences",
          x = NULL) +
      coord_flip()+
      theme_light()+
      geom_text(aes(x=word, y=n, label = n),
                    check_overlap = FALSE, size=3, angle=0, hjust=1)+
                    theme(text = element_text(size=16))

Positive / Negative Wordcloud

metootext %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()%>%
  group_by(sentiment) %>%
  top_n(100) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0)%>%
  comparison.cloud(colors = c("#F8766D", "#00BFC4"),
                   max.words = 100)

Context of Positive / Negative Words

metoo_tweets%>%
  filter(str_detect(text, "silent"))%>%
  subset(select=c("text"))%>%
  print("text")

Word Pairings

Individual words may lend us some understanding of the kinds of language being used, but may not give us as much context as we may need to develop a qualitative understanding of the text. Looking at word pairings may help us see how terms are related to one another.

Which words are frequently paired together?

#install_github("dgrtwo/widyr")
#install.packages("igraph")
#install.packages("ggraph")
#library("devtools")
#library(widyr)
#library(igraph)
#library(ggraph)
#Generating all occuring word combinations (84 million combinations of 475,000 words)
metootextweb <- metootext %>% 
  pairwise_count(word, id, sort = TRUE, upper = FALSE)
metootextweb %>%
  filter(n >= 400) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n), edge_colour = "darkred") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE, 
                 point.padding = unit(0.2, "lines")) +
  theme_void()

The word web above displays the frequency of word pairings. This gives an unfair advantage to words that are used more frequently overall, even if they are not paired together as consistently as other words. To level the playing field for word pairings, we can look at the correlation between two words. It is possible that words which show up less frequently hold stronger correlations.

Which words are highly correlated?

metootextweb2 <- metootext %>% 
  group_by(word) %>%
  filter(n() >= 800) %>%
  pairwise_cor(word, id, sort = TRUE, upper = FALSE)
metootextweb2 %>%
  filter(correlation > .2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), 
                 edge_colour = "royalblue") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()

Which words are highly correlated? (stop words included)

metootext2 <- metoo_tweets %>%
                     unnest_tokens(word, text)
metootextweb3 <- metootext2 %>% 
  group_by(word) %>%
  filter(n() >= 1000) %>%
  pairwise_cor(word, id, sort = TRUE, upper = FALSE)
metootextweb3 %>%
  filter(correlation > .2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), 
                 edge_colour = "royalblue") +
  geom_node_point(size = 5) +
  geom_node_text(aes(label = name), repel = TRUE,
                 point.padding = unit(0.2, "lines")) +
  theme_void()

Conclusion & Notes

Through this analysis we are able to identify the kind of language being used to describe the issue, the proportion of positive-negative words that characterize the expressions of twitter users, and the relationships between words which occur frequently and consistently in pairs.

Newsjacking, otherwise labeled in this context “opinion spam”, is an issue which distorts the analysis. The #metoo hashtag is intended as a content classifier, and the unrelated content that appears within this segment of tweets contributes unrelated data into the sentiment measurements.

It is also important to note that sentiment analysis can tell you about the sentiment of the language being used by a person, but not necessarily about the individual's perspective or leaning on an issue.

Sentiment analysis of individual words may give insight into the tone of a statement, but word clouds, word webs, and +/- language timelines may offer more insight into the substantive meaning of the text.

Emoji characters offer valuable sentiment information, but are unfortunately left out of this analysis. Decoding and incorporating emojis into such text analysis would be an important addition to the qualitative content of a text analysis report.

---
title: ""
output: html_notebook
---
![](/Users/Brett/Library/Mobile Documents/com~apple~CloudDocs/All Files/Employers/Resume/MDRC/MeToo.png)

##Description:

An analysis of tweets under the #metoo hashtag, collected between December 1 and December 31, 2017.

```{r, echo=TRUE, message=FALSE, warning=FALSE}
#install.packages("twitteR")
#install.packages("ROAuth")
#install.packages("httr")
library(twitteR)
library(ROAuth)
library(httr)
library(plyr)
library(ggplot2)
library(ggthemes)
library(dplyr)
library(magrittr)
library(tidytext)
library(wordcloud)
library(tidyr)
library(reshape2)
library(devtools)
library(widyr)
library(igraph)
library(ggraph)
library(stringr)
library(readr)
```

```{r, eval=FALSE, include=FALSE}
# Set API keys.
# Credentials are read from environment variables (e.g. defined in ~/.Renviron)
# so that secrets never live in the notebook source.
# SECURITY: any key or token that was ever written literally in this file
# should be treated as compromised and regenerated in the Twitter developer
# console.
api_key <- Sys.getenv("TWITTER_API_KEY")
api_secret <- Sys.getenv("TWITTER_API_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET")

# Sys.getenv() returns "" for unset variables; fail early with a clear message
# instead of sending empty credentials to the API.
if (!all(nzchar(c(api_key, api_secret, access_token, access_token_secret)))) {
  stop(
    "Set TWITTER_API_KEY, TWITTER_API_SECRET, TWITTER_ACCESS_TOKEN and ",
    "TWITTER_ACCESS_TOKEN_SECRET before authenticating.",
    call. = FALSE
  )
}

setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)
```


##Scrape Twitter for Tweets with **#metoo** hashtag


##### Define API Keys & Authenticate through Twitter app
```{r, eval=FALSE, include=TRUE}
# Placeholder credentials shown to readers; substitute the values issued for
# your own Twitter app before running this chunk.
api_key             <- "*******"
api_secret          <- "*******"
access_token        <- "*******"
access_token_secret <- "*******"

# Authenticate this R session against the Twitter API.
setup_twitter_oauth(
  consumer_key = api_key,
  consumer_secret = api_secret,
  access_token = access_token,
  access_secret = access_token_secret
)
```


##### Download tweets containing the **#metoo** hashtag posted in December 2017. Save as .csv.
```{r, eval=FALSE, include=TRUE}
# Search Twitter for English-language tweets tagged #metoo, requesting up to
# 20,000 results inside the since/until window, then flatten the returned
# status objects into a data frame.
# NOTE(review): the search API may treat `until` as exclusive, in which case
# tweets from 2017-12-31 are not returned -- confirm against the API docs.
metoo_tweets <- searchTwitter(
  "#metoo",
  n = 20000,
  lang = "en",
  since = "2017-12-01",
  until = "2017-12-31"
) %>%
  twListToDF()

# Optional: keep only a subset of the columns.
#   subset(select = c("id", "text", "created",
#                     "screenName", "retweetCount",
#                     "isRetweet", "retweeted"))

# Save metoo_tweets to .csv
# write.csv(metoo_tweets, "/Users/Brett/Desktop/metoo/metoo_12_23_17.csv")
```

#####Load Data
```{r, message=FALSE, warning=FALSE}
# Load the archived December 2017 #metoo tweets from a local CSV export.
# NOTE(review): this is an absolute, machine-specific path -- adjust it before
# running the notebook on another computer.
metoo_tweets <- read_csv("/Users/Brett/Library/Mobile Documents/com~apple~CloudDocs/All Files/Education/QC Teaching/Tweet Archive/metoo/metoo_tweets_dec2017.csv")

# Take a random sample of the ~300K tweets in this dataset.
# The seed is fixed so that the sample -- and every count and plot derived from
# it -- is reproducible between runs. The sample size is capped at the number
# of rows so that a smaller input file does not make sample_n() fail.
set.seed(2017)
metoo_tweets <- sample_n(metoo_tweets, min(50000, nrow(metoo_tweets)))
```


#####Remove Emoji Characters
```{r}
# Strip emoji and any other character that cannot be represented in ASCII
# (sub = "" deletes unconvertible bytes instead of returning NA).
# iconv() is vectorised, so it is applied to the whole column directly rather
# than element-wise through sapply(), which would also attach every tweet's
# full text as the name of its element.
metoo_tweets$text <- iconv(metoo_tweets$text, "latin1", "ASCII", sub = "")
```

#####Separate each individual word into a row of its own.
```{r}
        # Tokenise the tweet text: one row per word (new `word` column), with
        # the remaining tweet columns repeated on every row.
        metootext <- unnest_tokens(metoo_tweets, word, text)
```

##### Create a list of custom stop words.

```{r}
# Load the `stop_words` lexicon table (columns `word`, `lexicon`).
data(stop_words)

# Extend it with tokens that are artefacts of tweets rather than language:
# URL fragments ("https", "t.co") and the retweet marker ("rt").
# tibble() replaces the deprecated data_frame() constructor.
custom_stop_words <- bind_rows(
  tibble(
    word = c("https", "rt", "t.co"),
    lexicon = "custom"
  ),
  stop_words
)
```

#####Remove stop words from twitter data.

```{r}
# Keep only the tokens that do not appear in the combined stop word list.
metootext <- anti_join(metootext, custom_stop_words, by = "word")
```

#####How many words are we working with now?
```{r, echo=FALSE}
# Number of word tokens (rows) left after stop word removal.
nrow(metootext)
```        

#####Plot Most Frequently Mentioned Words
```{r}
# Plot most frequent words (original tweets only, words used more than 500 times).

# plyr and dplyr export clashing verbs (count, mutate, ...), so plyr is taken
# off the search path before piping. Its namespace is left loaded: ggplot2,
# reshape2 and others import it, so asking R to unload it only fails with a
# "namespace cannot be unloaded" message. The guard keeps the chunk re-runnable
# once plyr has already been detached.
if ("package:plyr" %in% search()) {
  detach("package:plyr")
}
library(dplyr)

metootext %>%
  filter(isRetweet == FALSE) %>%
  count(word, sort = TRUE) %>%
  filter(n > 500) %>%
  # Order the factor by frequency so bars are sorted in the plot.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "lightblue") +
  xlab("Words") +
  # The data set covers December 2017, so the label names that period.
  ylab("# of Mentions | December 2017") +
  coord_flip() +
  theme_light() +
  theme(text = element_text(size = 10)) +
  geom_text(aes(x = word, y = n, label = n),
            check_overlap = FALSE, size = 3, hjust = 1)
```

#####Wordcloud of Most Frequently Mentioned Words
```{r}
# Word frequencies for original (non-retweet) tweets, restricted to words that
# occur more than 40 times.
metootextcloud <- metootext %>%
  filter(isRetweet == FALSE) %>%
  count(word, sort = TRUE) %>%
  filter(n > 40) %>%
  mutate(word = reorder(word, n))

# Draw at most 200 of those words, most frequent in the centre, with roughly a
# third of them rotated.
wordcloud(
  words = metootextcloud$word,
  freq = metootextcloud$n,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

```

#Sentiment Analysis

>Evaluate Sentiment Using [NRC Emotion Lexicon](http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm) for word-sentiment associations.

##### Plot most common sentiments

###### **Note:** *words are counted once for every sentiment they are associated with*
```{r, message=FALSE}
# Total number of word tokens per NRC sentiment/emotion category, drawn as a
# horizontal bar chart sorted by frequency.
metootext %>%
  # Explicit join key; a word matches once per sentiment it is tagged with.
  inner_join(get_sentiments("nrc"), by = "word") %>%
  # count() already returns one ungrouped row per sentiment, so no further
  # group_by()/ungroup() is needed.
  count(sentiment, sort = TRUE) %>%
  mutate(sentiment = reorder(sentiment, n)) %>%
  ggplot(aes(sentiment, n)) +
  geom_col(fill = "lightblue", show.legend = FALSE) +
  labs(y = "Number of Occurrences", x = NULL) +
  coord_flip() +
  geom_text(aes(x = sentiment, y = n, label = n),
            check_overlap = FALSE,
            size = 3,
            angle = 0,
            hjust = 1.5) +
  theme_light() +
  theme(text = element_text(size = 14))

```

##### Plot most common words, by sentiment
```{r, fig.height=12, fig.width=8, message=FALSE}

# Ten most frequent words within each NRC sentiment category, one facet per
# sentiment.
metootext %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  # Rank by the count column explicitly instead of relying on top_n()'s
  # "last column" default (ties can yield more than ten rows per group).
  top_n(10, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y", nrow = 4) +
  labs(y = "Number of Occurrences",
       x = NULL) +
  coord_flip() +
  theme_light() +
  geom_text(aes(x = word, y = n, label = n),
            check_overlap = FALSE, size = 7, angle = 0, hjust = .75) +
  theme(text = element_text(size = 24))
```

####In what context are people using the word "powerful"?
```{r}
# Show the text of every sampled tweet that contains the string "powerful"
# (case-sensitive match), so the word can be read in context.
metoo_tweets %>%
  filter(str_detect(text, "powerful")) %>%
  dplyr::select(text) %>%
  print(row.names = FALSE)
```




##Positive-Negative Sentiment Analysis

>Using [Bing Lexicon](https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html) for positive-negative word classifications.


####Total Number of Positive / Negative Words
```{r, fig.height=1, fig.width=2, message=FALSE}
# Total number of positive vs. negative word tokens according to the Bing
# lexicon.
metootext %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  # count() returns one ungrouped row per sentiment; no regrouping is needed.
  count(sentiment, sort = TRUE) %>%
  ggplot(aes(sentiment, n, fill = sentiment)) +
  # geom_col() is the stat = "identity" form of geom_bar(), matching the other
  # bar charts in this notebook.
  geom_col(show.legend = FALSE) +
  labs(y = "Number of Occurrences", x = NULL) +
  geom_text(aes(x = sentiment, y = n, label = n),
            check_overlap = FALSE,
            size = 3,
            angle = 0,
            vjust = 1.5) +
  theme_light() +
  # Counts are printed on the bars, so the y-axis tick labels are hidden.
  theme(text = element_text(size = 8),
        axis.text.y = element_blank())
```

####Most frequently mentioned Positive / Negative Words

```{r, message=FALSE}
# Ten most frequent positive and negative words (Bing lexicon), one facet per
# polarity.
metootext %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  # Rank by the count column explicitly rather than by top_n()'s implicit
  # "last column" default.
  top_n(10, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Number of Occurrences",
       x = NULL) +
  coord_flip() +
  theme_light() +
  geom_text(aes(x = word, y = n, label = n),
            check_overlap = FALSE, size = 3, angle = 0, hjust = 1) +
  theme(text = element_text(size = 16))
```


####Positive / Negative Wordcloud
```{r, message=FALSE}
# Comparison word cloud of the 100 most frequent positive and 100 most
# frequent negative words (Bing lexicon).
metootext %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  # Explicit weight column instead of top_n()'s implicit "last column" default.
  top_n(100, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  # Reshape to a word x sentiment matrix of counts (0 where a word does not
  # carry that sentiment), the input format comparison.cloud() expects.
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("#F8766D", "#00BFC4"),
                   max.words = 100)
```

####Context of Positive / Negative Words

```{r, echo=TRUE}
# Show the text of every sampled tweet that contains the string "silent"
# (case-sensitive match), so the word can be read in context.
metoo_tweets %>%
  filter(str_detect(text, "silent")) %>%
  subset(select = c("text")) %>%
  # The column is already selected above; print() takes no column name, and a
  # stray positional string can be matched to its `width` argument.
  print()
```



##Word Pairings

>Individual words may lend us some understanding of the kinds of language being used, but may not give us as much context as we may need to develop a qualitative understanding of the text. Looking at word pairings may help us see how terms are related to one another.

####Which words are frequently paired together?

```{r, echo=TRUE}

# One-off setup, kept for reference:
# install_github("dgrtwo/widyr")
# install.packages("igraph")
# install.packages("ggraph")
# library("devtools")
# library(widyr)
# library(igraph)
# library(ggraph)

# Generate all occurring word combinations (84 million combinations of
# 475,000 words): for each pair of words, count how many distinct `id`
# values (presumably tweet ids) contain both. upper = FALSE keeps each
# unordered pair once.
metootextweb <- pairwise_count(
  metootext,
  word,
  id,
  sort = TRUE,
  upper = FALSE
)

# Network of the pairs that co-occur at least 400 times; edge opacity and
# width scale with the co-occurrence count.
metootextweb %>%
  filter(n >= 400) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n, edge_width = n),
    edge_colour = "darkred"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
```


>The word web above displays the frequency of word pairings. This gives an unfair advantage to words that are used more frequently overall, even if they are not paired together as consistently as other words. To level the playing field for word pairings, we can look at the correlation between two words. It is possible that words which show up less frequently hold stronger correlations. 

####Which words are highly correlated?
```{r, echo=TRUE}
# Pairwise correlation between words across `id` values, restricted to words
# that occur at least 800 times in the stop-word-filtered tokens.
metootextweb2 <- metootext %>%
  group_by(word) %>%
  filter(n() >= 800) %>%
  pairwise_cor(word, id, sort = TRUE, upper = FALSE)

# Network of word pairs whose correlation exceeds 0.2; edge opacity and width
# scale with the correlation.
metootextweb2 %>%
  filter(correlation > 0.2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = correlation, edge_width = correlation),
    edge_colour = "royalblue"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()

```

####Which words are highly correlated? (stop words included)
```{r, echo=TRUE}
# Re-tokenise the tweets without removing stop words.
metootext2 <- unnest_tokens(metoo_tweets, word, text)

# Pairwise correlation between words across `id` values, restricted to words
# that occur at least 1000 times.
metootextweb3 <- metootext2 %>%
  group_by(word) %>%
  filter(n() >= 1000) %>%
  pairwise_cor(word, id, sort = TRUE, upper = FALSE)

# Network of word pairs whose correlation exceeds 0.2; edge opacity and width
# scale with the correlation.
metootextweb3 %>%
  filter(correlation > 0.2) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = correlation, edge_width = correlation),
    edge_colour = "royalblue"
  ) +
  geom_node_point(size = 5) +
  geom_node_text(
    aes(label = name),
    repel = TRUE,
    point.padding = unit(0.2, "lines")
  ) +
  theme_void()
```
## Conclusion & Notes

Through this analysis we are able to identify the kind of language being used to describe the issue, the proportion of positive-negative words that characterize the expressions of twitter users, and the relationships between words which occur frequently and consistently in pairs.

Newsjacking, otherwise labeled in this context "opinion spam", is an issue which distorts the analysis. The #metoo hashtag is intended as a content classifier, and the unrelated content that appears within this segment of tweets contributes unrelated data into the sentiment measurements.

It is also important to note that sentiment analysis can tell you about the sentiment of the language being used by a person, but not necessarily about the individual's perspective or leaning on an issue.

Sentiment analysis of individual words may give insight into the tone of a statement, but word clouds, word webs, and +/- language timelines may offer more insight into the substantive meaning of the text.

Emoji characters offer valuable sentiment information, but are unfortunately left out of this analysis. Decoding and incorporating emojis into such text analysis would be an important addition to the qualitative content of a text analysis report.