1. Introduction

This is a short R Markdown document that plots word clouds from the tweets of accounts that abused MP Anna Soubry. It is a follow-up to the previous document, which details the data collection methods.

2. Wordcloud From Active Abusers

2.1. From all tweets in the dataset

This is a large word cloud plotted from all tweets from active abusers. Check the code to see the stop words removed from the corpus.

library(tidyverse)
library(tm)
library(wordcloud)
library(RColorBrewer)

# Load the collected timelines of the accounts that are still active.
active <- read_csv(
  file = "~/Documents/Work/Anna_Soubry/active-accounts/active_accounts_timelines.tweets.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   status_id = col_double(),
##   created_at = col_datetime(format = ""),
##   user_id = col_double(),
##   reply_to_status_id = col_double(),
##   reply_to_user_id = col_double(),
##   is_quote = col_logical(),
##   is_retweet = col_logical(),
##   favorite_count = col_integer(),
##   retweet_count = col_integer(),
##   quoted_status_id = col_double(),
##   retweet_status_id = col_double(),
##   protected = col_logical(),
##   followers_count = col_integer(),
##   friends_count = col_integer(),
##   listed_count = col_integer(),
##   statuses_count = col_integer(),
##   favourites_count = col_integer(),
##   account_created_at = col_datetime(format = ""),
##   verified = col_logical()
## )
## See spec(...) for full column specifications.
# Normalise raw tweet text so that it can be tokenised for a word cloud.
#
# Args:
#   x: character vector of tweet texts.
#
# Returns:
#   A character vector of the same length as `x`: lower-cased, with retweet
#   headers, links, @mentions, punctuation, digits and stop words removed,
#   whitespace runs collapsed to single spaces, and both ends trimmed.
clean.text <- function(x) {
  x <- tolower(x)
  # drop the "rt @user:" retweet header
  x <- gsub("rt @\\w+:", "", x)
  # drop links: match up to the next whitespace so the "://t.co/..." part is
  # removed too (matching word characters only left "tco..." tokens behind
  # once the punctuation was stripped)
  x <- gsub("http[^[:space:]]+", "", x)
  # drop @mentions
  x <- gsub("@\\w+", "", x)
  # drop punctuation and digits
  x <- gsub("[[:punct:]]", "", x)
  x <- gsub("[[:digit:]]", "", x)
  # drop stop words: tm's English list plus filler words specific to this corpus
  x <- removeWords(
    x,
    c(
      stopwords("english"),
      "amp", "dont", "like", "just", "look", "looks", "well", "will", "can",
      "one"
    )
  )
  # collapse every whitespace run (spaces, tabs, line ends) into ONE space;
  # replacing a run with "" would glue the neighbouring words together.
  # Done after stop-word removal so the gaps it leaves are collapsed as well.
  x <- gsub("[[:space:]]+", " ", x)
  trimws(x)
}

# Keep a cleaned copy of every tweet's text next to the raw text.
active <- mutate(active, tweet_cleaned = clean.text(text))

# One cloud over all cleaned tweets: terms used at least twice, capped at 400
# terms, drawn with the eight-colour "Dark2" palette.
corpus <- tm::Corpus(VectorSource(active$tweet_cleaned))
wordcloud(
  corpus,
  max.words = 400,
  min.freq = 2,
  rot.per = 0.3,
  random.order = FALSE,
  random.color = FALSE,
  colors = brewer.pal(8, "Dark2")
)

This is a bit hard to interpret as it contains too much data. Therefore, I will create word clouds from each account individually below.

2.2. A wordcloud for each individual abuser

This code took a bit longer than expected. Although it’s easy to create one word cloud, it’s not that easy to group the data by user and create an individual word cloud for each abuser. Many bits and bobs failed along the way.

Please note that these word clouds include all tweets and retweets posted by active abusers. Also, the 200 most frequent words were plotted for each user, and the words are positioned, coloured and sized based on their frequency rankings.

# One row per account; that account's tweets are nested in the `data` column.
grouped <- nest(group_by(active, user_id))

# Draw a titled word cloud for the tweets of one account.
#
# Args:
#   z: data frame with one account's tweets; must contain the columns
#      `tweet_cleaned` (cleaned tweet text) and `screen_name`.
#
# Returns:
#   The string "next user", which is also printed as a progress marker.
#   Called for its side effect of drawing on the current graphics device.
try_function <- function(z) {
  # term frequencies over the account's distinct cleaned tweets
  tweets <- z %>% select(tweet_cleaned) %>% distinct()
  corpus <- tm::Corpus(VectorSource(tweets$tweet_cleaned))
  tdm <- tm::TermDocumentMatrix(corpus)
  v <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)

  # `screen_name` has one entry per tweet; use a single value so the title is
  # drawn once instead of being overplotted once per row
  handle <- unique(z$screen_name)[1]

  # nothing survived cleaning: wordcloud() cannot draw an empty term list
  if (length(v) == 0) {
    message("No terms left after cleaning for @", handle, "; skipping word cloud")
    return(print("next user"))
  }
  d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)

  # fixed seed so the word placement is reproducible between runs
  set.seed(2018)

  # two stacked panels: a narrow one for the title, a tall one for the cloud;
  # the previous margins and layout are restored when the function exits
  layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
  old_par <- par(mar = rep(1, 4))
  on.exit({
    par(old_par)
    layout(1)
  }, add = TRUE)

  plot.new()
  text(x = 0.5, y = 0.8, paste0("Wordcloud of Content Posted by @", handle))
  wordcloud(
    words = d$word, freq = d$freq, min.freq = 1,
    max.words = 200, random.order = FALSE, rot.per = 0.30,
    colors = brewer.pal(8, "Dark2")
  )
  print("next user")
}

# Draw one cloud per account; `b` only collects the returned progress markers.
b <- grouped$data %>% map(try_function)

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"
return(print("next user"))
## [1] "next user"

3. Wordclouds From Suspended Abusers

3.1. From all mentions in the dataset

Below is a word cloud from all tweets which were identified as mentioning the suspended abusers. Again, the 200 most frequent words were plotted, and the words are positioned, coloured and sized based on their frequency rankings.

# Load the tweets collected for the suspended accounts, tag each tweet with the
# first handle (in the order listed below) whose name occurs in its text, and
# add a cleaned copy of the text. Tweets matching none of the handles get NA.
suspended <- read_csv("~/Documents/Work/Anna_Soubry/suspended_accounts/suspended_accounts_tweets_binded.csv")
suspended <- mutate(
  suspended,
  mentioned_abuser = case_when(
    str_detect(text, "IrateBrit") ~ "IrateBrit",
    str_detect(text, "Mos__Maiorum") ~ "Mos__Maiorum",
    str_detect(text, "TudorRashoff") ~ "TudorRashoff",
    str_detect(text, "frottroilism") ~ "frottroilism",
    str_detect(text, "edge1959") ~ "edge1959",
    str_detect(text, "simonfield68") ~ "simonfield68",
    str_detect(text, "TerryNOTA60") ~ "TerryNOTA60",
    str_detect(text, "Km21M") ~ "Km21M"
  ),
  # copy of the tag, used as an index inside map(): grouping by
  # `mentioned_abuser` keeps that column outside the nested data
  mentioned_abuser_2 = mentioned_abuser,
  tweet_cleaned = clean.text(text)
)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   status_id = col_double(),
##   created_at = col_datetime(format = ""),
##   user_id = col_double(),
##   reply_to_status_id = col_double(),
##   reply_to_user_id = col_double(),
##   is_quote = col_logical(),
##   is_retweet = col_logical(),
##   favorite_count = col_integer(),
##   retweet_count = col_integer(),
##   quoted_status_id = col_double(),
##   protected = col_logical(),
##   followers_count = col_integer(),
##   friends_count = col_integer(),
##   listed_count = col_integer(),
##   statuses_count = col_integer(),
##   favourites_count = col_integer(),
##   account_created_at = col_datetime(format = ""),
##   verified = col_logical()
## )
## See spec(...) for full column specifications.
# One cloud over every cleaned tweet that mentions a suspended account:
# terms used at least twice, capped at 200 terms, "Dark2" palette.
corpus <- tm::Corpus(VectorSource(suspended$tweet_cleaned))
wordcloud(
  corpus,
  max.words = 200,
  min.freq = 2,
  rot.per = 0.3,
  random.order = FALSE,
  random.color = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# show any warnings raised while plotting
warnings()
## NULL

3.2. A wordcloud for mentions at each individual abuser

# One row per mentioned handle; the matching tweets are nested in `data`.
grouped_suspended <- nest(group_by(suspended, mentioned_abuser))

# Draw a titled word cloud for the tweets that mention one account.
#
# Args:
#   x: data frame with the tweets mentioning one account; must contain the
#      columns `tweet_cleaned` (cleaned tweet text) and `mentioned_abuser_2`
#      (the mentioned handle).
#
# Returns:
#   The string "next user", which is also printed as a progress marker.
#   Called for its side effect of drawing on the current graphics device.
try_function_2 <- function(x) {
  # term frequencies over the distinct cleaned tweets
  tweets <- x %>% select(tweet_cleaned) %>% distinct()
  corpus <- tm::Corpus(VectorSource(tweets$tweet_cleaned))
  tdm <- tm::TermDocumentMatrix(corpus)
  v <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)

  # `mentioned_abuser_2` has one entry per tweet; use a single value so the
  # title is drawn once instead of being overplotted once per row
  handle <- unique(x$mentioned_abuser_2)[1]

  # nothing survived cleaning: wordcloud() cannot draw an empty term list
  if (length(v) == 0) {
    message("No terms left after cleaning for @", handle, "; skipping word cloud")
    return(print("next user"))
  }
  d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)

  # fixed seed so the word placement is reproducible between runs
  set.seed(2018)

  # two stacked panels: a narrow one for the title, a tall one for the cloud;
  # the previous margins and layout are restored when the function exits
  layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
  old_par <- par(mar = rep(1, 4))
  on.exit({
    par(old_par)
    layout(1)
  }, add = TRUE)

  plot.new()
  text(x = 0.5, y = 0.8, paste0("Wordcloud of Tweets Which Mention @", handle))
  wordcloud(
    words = d$word, freq = d$freq, min.freq = 1,
    max.words = 200, random.order = FALSE, rot.per = 0.30,
    colors = brewer.pal(8, "Dark2")
  )
  print("next user")
}

# Draw one cloud per mentioned handle with try_function_2 (defined above);
# the result only collects the returned progress markers.
# NOTE(review): the variable name `c` is the same as base R's c() function.
c <- grouped_suspended$data %>% map(try_function_2)

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"

## [1] "next user"