Tweets Preprocessing

#library(twitteR)
#library(httr)
#library(base64enc)
library(jsonlite)
library(stringr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
library(XML)
library(RCurl)
## Loading required package: bitops
library(methods)
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(tidytext)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate

The CSV files containing the Twitter data were uploaded to GitHub.

The following function creates a vector with all the links to be accessed to retrieve the data.

start_url <- "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet"
end_url <- ".csv"
# Numbers of the files used in this project (tweet67.csv through tweet177.csv)
vec <- seq(67, 177)
# Function that builds the full URL for each file number;
# str_c() is vectorized, so no explicit loop is needed
pages <- function(vec){
        str_c(start_url, vec, end_url)
}
urls <- pages(vec)
head(urls)
## [1] "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet67.csv"
## [2] "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet68.csv"
## [3] "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet69.csv"
## [4] "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet70.csv"
## [5] "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet71.csv"
## [6] "https://raw.githubusercontent.com/bsosnovski/FinalProject/master/tweet72.csv"
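
Since the file numbering could have gaps, an optional reachability check before downloading can catch broken links early. This sketch uses url.exists() from the already-loaded RCurl package, which issues a request per URL and returns TRUE or FALSE:

# Optional sanity check: flag any URLs that do not resolve
bad <- urls[!sapply(urls, RCurl::url.exists)]
if (length(bad) > 0) warning("Unreachable files: ", paste(bad, collapse = ", "))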

The URLs created will be used to open connections to the files, read each one into a data frame, select the columns of interest, and bind the results into a single data frame.

n <- length(urls)
Stream <- data.frame()
for (i in 1:n){
        csvfile <- url(urls[i])
        df <- read.csv(csvfile, header = TRUE, fileEncoding = "ASCII", stringsAsFactors = FALSE)
        # Keep only the timestamp, text, user name and user location columns
        df <- df %>% select(results.created_at, results.text, results.user.name, results.user.location)
        Stream <- rbind(Stream, df)
}
str(Stream)
## 'data.frame':    11072 obs. of  4 variables:
##  $ results.created_at   : chr  "Sat Nov 10 01:25:50 +0000 2018" "Sat Nov 10 01:25:31 +0000 2018" "Sat Nov 10 01:24:59 +0000 2018" "Sat Nov 10 01:24:31 +0000 2018" ...
##  $ results.text         : chr  "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ ...
##  $ results.user.name    : chr  "Cate   Resist\U0001f44f\U0001f3feEvery\U0001f44f\U0001f3ffDamned\U0001f44f\U0001f3fcDay\U0001f44f\U0001f3fd" "Pennell Somsen" "Barbara Ward #FBR \U0001f30a" "Randy #RESIST" ...
##  $ results.user.location: chr  NA "Mérida, Yucatán & Harlem, New York" "New Hampshire, USA" NA ...
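
Growing Stream with rbind() inside the loop re-copies the accumulated data frame on every iteration, which is fine at this scale but slows down quickly as the number of files grows. An equivalent pattern, sketched below, reads each file into a list element and binds once at the end, with tryCatch() returning NULL for any file that fails to download instead of aborting the whole loop:

# Alternative: read all files into a list, then bind once with dplyr
dfs <- lapply(urls, function(u) {
        tryCatch({
                df <- read.csv(url(u), header = TRUE, fileEncoding = "ASCII", stringsAsFactors = FALSE)
                df %>% select(results.created_at, results.text, results.user.name, results.user.location)
        }, error = function(e) NULL)    # skip files that fail to download
})
Stream <- bind_rows(dfs)                # NULL entries are dropped automatically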

For our analysis, we are interested in the tweets sent by people other than the candidates themselves, so we exclude the tweets whose user names match the candidates'.

Stream <- Stream %>% filter(!results.user.name %in% c("Max Rose","Dan Donovan"))
str(Stream)
## 'data.frame':    11037 obs. of  4 variables:
##  $ results.created_at   : chr  "Sat Nov 10 01:25:50 +0000 2018" "Sat Nov 10 01:25:31 +0000 2018" "Sat Nov 10 01:24:59 +0000 2018" "Sat Nov 10 01:24:31 +0000 2018" ...
##  $ results.text         : chr  "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ ...
##  $ results.user.name    : chr  "Cate   Resist\U0001f44f\U0001f3feEvery\U0001f44f\U0001f3ffDamned\U0001f44f\U0001f3fcDay\U0001f44f\U0001f3fd" "Pennell Somsen" "Barbara Ward #FBR \U0001f30a" "Randy #RESIST" ...
##  $ results.user.location: chr  NA "Mérida, Yucatán & Harlem, New York" "New Hampshire, USA" NA ...
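Note that this filter only removes exact display-name matches. If decorated variants of the candidates' names (extra emoji, campaign hashtags) should also be excluded, a case-insensitive substring match is one option; the pattern below is a hypothetical extension, not something applied in this analysis:

# Hypothetical stricter filter: drop users whose display name contains
# either candidate's name, ignoring case (coalesce() guards against NA names)
Stream2 <- Stream %>%
        filter(!coalesce(str_detect(tolower(results.user.name), "max rose|dan donovan"), FALSE))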
# Add row numbers and move to the front of the data frame
Stream <- Stream %>% mutate(id = row_number()) %>% select(id, everything())

# Convert the creation dates from Twitter's string format to POSIXct,
# replacing the original column
Stream$results.created_at <- as.POSIXct(Stream$results.created_at, format = "%a %b %d %H:%M:%S +0000 %Y")
str(Stream)
## 'data.frame':    11037 obs. of  5 variables:
##  $ id                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ results.created_at   : POSIXct, format: "2018-11-10 01:25:50" "2018-11-10 01:25:31" ...
##  $ results.text         : chr  "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ "RT @crhousel: \U0001f389\U0001f389Celebrating @MaxRose4NY ‘s Victory #NY11 !! Your Passion for Grassroots Chang"| __truncated__ ...
##  $ results.user.name    : chr  "Cate   Resist\U0001f44f\U0001f3feEvery\U0001f44f\U0001f3ffDamned\U0001f44f\U0001f3fcDay\U0001f44f\U0001f3fd" "Pennell Somsen" "Barbara Ward #FBR \U0001f30a" "Randy #RESIST" ...
##  $ results.user.location: chr  NA "Mérida, Yucatán & Harlem, New York" "New Hampshire, USA" NA ...
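One reason to keep the POSIXct column is that it makes time-based summaries straightforward. As a quick illustration (not part of the original analysis), tweet volume per calendar day can be tabulated directly:

# Illustration: count tweets per day using the parsed timestamps
Stream %>%
        mutate(day = as.Date(results.created_at)) %>%
        count(day)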
# Applying iconv() to the whole data frame would coerce it to character,
# so this line stays commented out; non-ASCII characters are removed
# later during corpus cleaning instead.
#combined_doc <- iconv(Stream, "UTF-8", "ASCII", sub = "")
#str(combined_doc)
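
If stripping non-ASCII characters from the data frame itself is ever needed (rather than from the corpus, as done below), iconv() expects a character vector and so should be applied column by column; a minimal sketch in base R:

# Sketch: strip non-ASCII characters from each character column of a copy
ascii_stream <- Stream
char_cols <- sapply(ascii_stream, is.character)
ascii_stream[char_cols] <- lapply(ascii_stream[char_cols], iconv, from = "UTF-8", to = "ASCII", sub = "")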

Tweets Cleaning

The next step is to clean the text of the tweets by removing non-ASCII characters, retweet markers, @-mentions, punctuation, digits, URLs, and extra whitespace, then converting to lowercase and removing English stop words:

Mycorpus <- Corpus(VectorSource(Stream$results.text))
# Various cleansing functions:
# Remove non-ASCII characters (emoji, etc.)
remove_ASCIIs <- function(x) gsub("[^\x01-\x7F]", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_ASCIIs))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_ASCIIs)): transformation drops documents
# Remove retweet markers (RT/via plus the attributed handles)
remove_RTs <- function(x) gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_RTs))
## Warning in tm_map.SimpleCorpus(Mycorpus, content_transformer(remove_RTs)):
## transformation drops documents
# Remove @-mentions
remove_ATs <- function(x) gsub("@\\w+", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_ATs))
## Warning in tm_map.SimpleCorpus(Mycorpus, content_transformer(remove_ATs)):
## transformation drops documents
# Remove all punctuation
remove_Puncts <- function(x) gsub("[[:punct:]]", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_Puncts))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_Puncts)): transformation drops documents
# Remove all digits
remove_Digits <- function(x) gsub("[[:digit:]]", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_Digits))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_Digits)): transformation drops documents
# Remove URLs in three steps; after the punctuation stripping above, links
# such as https://t.co/xyz have collapsed to httpstcoxyz, which this matches
remove_HTTPSs <- function(x) gsub("http\\w+", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_HTTPSs))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_HTTPSs)): transformation drops documents
# Collapse runs of spaces/tabs left behind into a single space
remove_HTTPSs2 <- function(x) gsub("[ \t]{2,}", " ", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_HTTPSs2))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_HTTPSs2)): transformation drops documents
# Trim leading and trailing whitespace
remove_HTTPSs3 <- function(x) gsub("^\\s+|\\s+$", "", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_HTTPSs3))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_HTTPSs3)): transformation drops documents
# Collapse any remaining whitespace runs into a single space
remove_WhiteSpace <- function(x) gsub("[ \t]{2,}", " ", x)
Mycorpus <- tm_map(Mycorpus, content_transformer(remove_WhiteSpace))
## Warning in tm_map.SimpleCorpus(Mycorpus,
## content_transformer(remove_WhiteSpace)): transformation drops documents
# Lowercase first, so the case-sensitive stop-word removal below also
# catches words at the start of sentences
Mycorpus <- tm_map(Mycorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(Mycorpus, content_transformer(tolower)):
## transformation drops documents
# Remove English stop words
Mycorpus <- tm_map(Mycorpus, removeWords, stopwords("en"))
## Warning in tm_map.SimpleCorpus(Mycorpus, removeWords, stopwords("en")):
## transformation drops documents
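# Optional spot check on a few cleaned tweets to confirm the
# transformations behaved as intended before building the matrix
inspect(Mycorpus[1:3])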
# Build the document-term matrix, then draw a word cloud of terms
# appearing at least 5 times
dtm <- DocumentTermMatrix(Mycorpus)
suppressWarnings(wordcloud(Mycorpus, random.order=F, scale=c(3, 0.5), min.freq = 5, col=rainbow(50)))
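
Beyond the word cloud, the document-term matrix can also be queried directly; for example, tm's findFreqTerms() lists every term appearing at least a given number of times (the threshold of 100 below is arbitrary):

# List terms occurring at least 100 times across the corpus
findFreqTerms(dtm, lowfreq = 100)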