Tweets Preprocessing

#library(twitteR)
#library(httr)
#library(base64enc)
library(jsonlite)
library(stringr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
library(XML)
library(RCurl)
## Loading required package: bitops
library(methods)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(tidytext)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

The CSV files containing the Twitter data were uploaded to GitHub.

The following function creates a vector with all the links to be accessed to retrieve the data.

start_url <- "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet"
end_url <- ".csv"
# Numbers of the files used in this project (tweet67.csv through tweet177.csv)
vec <- seq(67, 177)
# Function that builds the full URL for each file number;
# str_c() is vectorized, so no explicit loop is needed
pages <- function(vec){
        str_c(start_url, vec, end_url)
}
urls <- pages(vec)
head(urls)
## [1] "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet67.csv"
## [2] "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet68.csv"
## [3] "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet69.csv"
## [4] "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet70.csv"
## [5] "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet71.csv"
## [6] "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet72.csv"
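
Since the file numbering could have gaps, an optional reachability check before downloading can catch broken links early. This sketch uses url.exists() from the already-loaded RCurl package, which issues a request per URL and returns TRUE or FALSE:

# Optional sanity check: flag any URLs that do not resolve
bad <- urls[!sapply(urls, RCurl::url.exists)]
if (length(bad) > 0) warning("Unreachable files: ", paste(bad, collapse = ", "))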

The URLs created will be used to open connections to the files, read each one into a data frame, select the columns of interest, and bind the results into a single data frame.

n <- length(urls)
Stream <- data.frame()
for (i in 1:n){
        csvfile <- url(urls[i])
        df <- read.csv(csvfile, header = TRUE, fileEncoding = "ASCII", stringsAsFactors = FALSE)
        # Keep only the timestamp, text, user name and user location columns
        df <- df %>% select(results.created_at, results.text, results.user.name, results.user.location)
        Stream <- rbind(Stream, df)
}
str(Stream)
## 'data.frame':    11072 obs. of  4 variables:
##  $ results.created_at   : chr  "Sat Nov 10 01:25:50 +0000 2018" "Sat Nov 10 01:25:31 +0000 2018" "Sat Nov 10 01:24:59 +0000 2018" "Sat Nov 10 01:24:31 +0000 2018" ...
##  $ results.text         : chr  "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ ...
##  $ results.user.name    : chr  "Cate   Resist\U0001f44f\U0001f3feEvery\U0001f44f\U0001f3ffDamned\U0001f44f\U0001f3fcDay\U0001f44f\U0001f3fd" "Pennell Somsen" "Barbara Ward #FBR \U0001f30a" "Randy #RESIST" ...
##  $ results.user.location: chr  NA "Mérida, Yucatán & Harlem, New York" "New Hampshire, USA" NA ...
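
Growing Stream with rbind() inside the loop re-copies the accumulated data frame on every iteration, which is fine at this scale but slows down quickly as the number of files grows. An equivalent pattern, sketched below, reads each file into a list element and binds once at the end, with tryCatch() returning NULL for any file that fails to download instead of aborting the whole loop:

# Alternative: read all files into a list, then bind once with dplyr
dfs <- lapply(urls, function(u) {
        tryCatch({
                df <- read.csv(url(u), header = TRUE, fileEncoding = "ASCII", stringsAsFactors = FALSE)
                df %>% select(results.created_at, results.text, results.user.name, results.user.location)
        }, error = function(e) NULL)    # skip files that fail to download
})
Stream <- bind_rows(dfs)                # NULL entries are dropped automatically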

For our analysis, we are interested in the tweets sent by people other than the candidates themselves, so we exclude the tweets whose user names match the candidates'.

Stream <- Stream %>% filter(!results.user.name %in% c("Max Rose","Dan Donovan"))
str(Stream)
## 'data.frame':    11037 obs. of  4 variables:
##  $ results.created_at   : chr  "Sat Nov 10 01:25:50 +0000 2018" "Sat Nov 10 01:25:31 +0000 2018" "Sat Nov 10 01:24:59 +0000 2018" "Sat Nov 10 01:24:31 +0000 2018" ...
##  $ results.text         : chr  "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ ...
##  $ results.user.name    : chr  "Cate   Resist\U0001f44f\U0001f3feEvery\U0001f44f\U0001f3ffDamned\U0001f44f\U0001f3fcDay\U0001f44f\U0001f3fd" "Pennell Somsen" "Barbara Ward #FBR \U0001f30a" "Randy #RESIST" ...
##  $ results.user.location: chr  NA "Mérida, Yucatán & Harlem, New York" "New Hampshire, USA" NA ...
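Note that this filter only removes exact display-name matches. If decorated variants of the candidates' names (extra emoji, campaign hashtags) should also be excluded, a case-insensitive substring match is one option; the pattern below is a hypothetical extension, not something applied in this analysis:

# Hypothetical stricter filter: drop users whose display name contains
# either candidate's name, ignoring case (coalesce() guards against NA names)
Stream2 <- Stream %>%
        filter(!coalesce(str_detect(tolower(results.user.name), "max rose|dan donovan"), FALSE))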
# Add row numbers and move to the front of the data frame
Stream <- Stream %>% mutate(id = row_number()) %>% select(id, everything())

# Convert the creation dates from Twitter's string format to POSIXct,
# replacing the original column
Stream$results.created_at <- as.POSIXct(Stream$results.created_at, format = "%a %b %d %H:%M:%S +0000 %Y")
str(Stream)
## 'data.frame':    11037 obs. of  5 variables:
##  $ id                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ results.created_at   : POSIXct, format: "2018-11-10 01:25:50" "2018-11-10 01:25:31" ...
##  $ results.text         : chr  "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ ...
##  $ results.user.name    : chr  "Cate   Resist\U0001f44f\U0001f3feEvery\U0001f44f\U0001f3ffDamned\U0001f44f\U0001f3fcDay\U0001f44f\U0001f3fd" "Pennell Somsen" "Barbara Ward #FBR \U0001f30a" "Randy #RESIST" ...
##  $ results.user.location: chr  NA "Mérida, Yucatán & Harlem, New York" "New Hampshire, USA" NA ...
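One reason to keep the POSIXct column is that it makes time-based summaries straightforward. As a quick illustration (not part of the original analysis), tweet volume per calendar day can be tabulated directly:

# Illustration: count tweets per day using the parsed timestamps
Stream %>%
        mutate(day = as.Date(results.created_at)) %>%
        count(day)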
# Applying iconv() to the whole data frame would coerce it to character,
# so this line stays commented out; non-ASCII characters are removed
# later during corpus cleaning instead.
#combined_doc <- iconv(Stream, "UTF-8", "ASCII", sub = "")
#str(combined_doc)
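
If stripping non-ASCII characters from the data frame itself is ever needed (rather than from the corpus, as done below), iconv() expects a character vector and so should be applied column by column; a minimal sketch in base R:

# Sketch: strip non-ASCII characters from each character column of a copy
ascii_stream <- Stream
char_cols <- sapply(ascii_stream, is.character)
ascii_stream[char_cols] <- lapply(ascii_stream[char_cols], iconv, from = "UTF-8", to = "ASCII", sub = "")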

Tweets Cleaning

The next step is to clean the text of the tweets by removing non-ASCII characters, retweet markers, @-mentions, punctuation, digits, URLs, and extra whitespace, then converting to lowercase and removing English stop words:

Mycorpus <- Corpus(VectorSource(Stream$results.text))
# Various cleansing functions:
# Remove non-ASCII characters (emoji, etc.)
remove_ASCIIs <- function(x) gsub("[^\x01-\x7F]", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_ASCIIs))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_ASCIIs)): transformation drops documents
# Remove retweet markers (RT/via plus the attributed handles)
remove_RTs <- function(x) gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_RTs))
## Warning in tm_map.SimpleCorpus(Mycorpus, content_transformer(remove_RTs)):
## transformation drops documents
# Remove @-mentions
remove_ATs <- function(x) gsub("@\\w+", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_ATs))
## Warning in tm_map.SimpleCorpus(Mycorpus, content_transformer(remove_ATs)):
## transformation drops documents
# Remove all punctuation
remove_Puncts <- function(x) gsub("[[:punct:]]", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_Puncts))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_Puncts)): transformation drops documents
# Remove all digits
remove_Digits <- function(x) gsub("[[:digit:]]", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_Digits))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_Digits)): transformation drops documents
# Remove URLs in three steps; after the punctuation stripping above, links
# such as https://t.co/xyz have collapsed to httpstcoxyz, which this matches
remove_HTTPSs <- function(x) gsub("http\\w+", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_HTTPSs))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_HTTPSs)): transformation drops documents
# Collapse runs of spaces/tabs left behind into a single space
remove_HTTPSs2 <- function(x) gsub("[ \t]{2,}", " ", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_HTTPSs2))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_HTTPSs2)): transformation drops documents
# Trim leading and trailing whitespace
remove_HTTPSs3 <- function(x) gsub("^\\s+|\\s+$", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_HTTPSs3))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_HTTPSs3)): transformation drops documents
# Collapse any remaining whitespace runs into a single space
remove_WhiteSpace <- function(x) gsub("[ \t]{2,}", " ", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_WhiteSpace))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_WhiteSpace)): transformation drops documents
# Lowercase first, so the case-sensitive stop-word removal below also
# catches words at the start of sentences
Mycorpus <- tm_map(Mycorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(Mycorpus, content_transformer(tolower)):
## transformation drops documents
# Remove English stop words
Mycorpus <- tm_map(Mycorpus, removeWords, stopwords("en"))
## Warning in tm_map.SimpleCorpus(Mycorpus, removeWords, stopwords("en")):
## transformation drops documents
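# Optional spot check on a few cleaned tweets to confirm the
# transformations behaved as intended before building the matrix
inspect(Mycorpus[1:3])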
# Build the document-term matrix, then draw a word cloud of terms
# appearing at least 5 times
dtm <- DocumentTermMatrix(Mycorpus)
suppressWarnings(wordcloud(Mycorpus, random.order=F, scale=c(3, 0.5), min.freq = 5, col=rainbow(50)))
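
Beyond the word cloud, the document-term matrix can also be queried directly; for example, tm's findFreqTerms() lists every term appearing at least a given number of times (the threshold of 100 below is arbitrary):

# List terms occurring at least 100 times across the corpus
findFreqTerms(dtm, lowfreq = 100)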