Getting started with R and RStudio (Installation)

Getting Twitter API keys

In order to access the Twitter Search or Streaming API, we need four pieces of information from Twitter: the API key, API secret, access token and access token secret. You can obtain these by creating an application on the Twitter developer site and copying the keys and tokens it generates for that app.
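Once you have them, it is good practice to keep the keys out of your scripts. Below is a minimal sketch, assuming you have added the four values to your ~/.Renviron file under the variable names shown (the names are arbitrary, not anything Twitter or R requires):

#read the credentials from environment variables instead of hard-coding them
#(set these names in ~/.Renviron and restart R before running this)
consumerKey    <- Sys.getenv("TWITTER_API_KEY")
consumerSecret <- Sys.getenv("TWITTER_API_SECRET")
accessToken    <- Sys.getenv("TWITTER_ACCESS_TOKEN")
accessSecret   <- Sys.getenv("TWITTER_ACCESS_SECRET")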

1. Extracting tweets with a particular hashtag or keyword

Once you have obtained the four authentication keys from Twitter, you can begin to search for specific keywords or hashtags of interest. In the code snippet below, we extract two hundred tweets containing the keyword “upwork”. These values can be adjusted as necessary.

#install relevant packages
install.packages("twitteR")
install.packages("ROAuth")
library(twitteR)
library(ROAuth)

#get authentication from Twitter

consumerKey<-   "xxxxxxxxxxxxx"
consumerSecret<-"xxxxxxxxxxxxxx"

accessToken<-"xxxxxxxxx"
accessSecret<-"xxxxxxxxxxxx"

setup_twitter_oauth (consumerKey, consumerSecret, accessToken, accessSecret)  # authenticate

#search for tweets by keyword

tweets<-searchTwitter("upwork", n=200, lang=NULL, since=NULL, until=NULL, locale=NULL,
                      geocode=NULL, sinceID=NULL, maxID=NULL, resultType=NULL,
                      retryOnRateLimit=120)

#Put the tweets in a data frame
tweets<-twListToDF(tweets)

#write out to a csv
write.csv(tweets, file="filename.csv")
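If you only want original tweets, twitteR also provides strip_retweets(), which is applied to the list of status objects returned by searchTwitter() before it is converted to a data frame. A small sketch of that variant:

#optional: drop retweets from the search results before converting to a data frame
raw_tweets<-searchTwitter("upwork", n=200, retryOnRateLimit=120)
raw_tweets<-strip_retweets(raw_tweets, strip_manual=TRUE, strip_mt=TRUE)
tweets<-twListToDF(raw_tweets)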

2. Retrieve a single user's or multiple users' timelines.

You can retrieve a user's timeline up to a maximum of 3,200 tweets. Install rtweet and provide the authentication keys from Twitter. Use the get_timeline() and get_timelines() functions to retrieve tweets posted by one or more users. Specify the number of tweets you want to extract, then save the output in your desired format.

install.packages("rtweet")#install rtweet if not alredy installed
library(rtweet)

#insert the consumer key and consumer secret from twitter
create_token(
  
  consumer_key = "xxxxxxx",
  consumer_secret = "xxxxxxxxxx"
)
timeline<-get_timeline("OfficialUoM",n=200)  #replace OfficialUoM with the target user's screen name or ID
timelines<-get_timelines(c("OfficialUoM","skynews"), n=200)  #to get multiple users' timelines

write.csv(timeline, file="timeline.csv")
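Note that rtweet returns data frames with list columns (for example the hashtags field), so base write.csv() can fail on them. If that happens, rtweet's own CSV writer, which flattens those columns, is a drop-in alternative:

#save with rtweet's writer, which flattens list columns before writing
write_as_csv(timeline, "timeline.csv")
write_as_csv(timelines, "timelines.csv")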

3. Get followers of a specific user.

To retrieve a list of user IDs of the accounts following a specific user, use the code below.

install.packages("rtweet")#install rtweet if not alredy installed
library(rtweet)

#insert the consumer key and consumer secret from twitter
create_token(
  
  consumer_key = "xxxxxxxx",
  consumer_secret = "xxxxxxxx"
)
followers<- get_followers("skynews", n = 5000, page = "-1", retryonratelimit = FALSE,
              parse = TRUE, verbose = TRUE, token = NULL) #get the list of follower IDs
count<-nrow(followers)#number of followers retrieved

write.csv(followers, file="followers.csv")
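get_followers() returns IDs only. To retrieve profile details for those accounts, you can pass the IDs to lookup_users(), just as section 4 does for friends; a minimal sketch, assuming the column is named user_id as in rtweet's output:

#look up profile data (name, location, followers count, etc.) for the follower IDs
follower_data <- lookup_users(followers$user_id)
write.csv(follower_data, file="follower_data.csv")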

4. Get the list of accounts followed by a user and retrieve data about those accounts.

To retrieve a list of IDs of the accounts followed by a user, use the code below. In addition, using those user IDs, you can retrieve further data about each account, such as name, location, language and status count. The result is a data frame with 88 metadata columns.

install.packages("rtweet")#install rtweet if not alredy installed
library(rtweet)

#insert the consumer key and consumer secret from twitter
create_token(
  
  consumer_key = "xxxxxxxxxx",
  consumer_secret = "xxxxxxxxxxxxx"
)

UoM_fds <- get_friends("OfficialUoM")# get the list of accounts followed by the University of Manchester official Twitter account
write.csv(UoM_fds, file="uomfds.csv") #save the result as a csv file
## look up data on those accounts: name, location, language, etc.
UoM_fds_data <- lookup_users(UoM_fds$user_id)
write.csv(UoM_fds_data, file="uomfdsdata.csv")
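From those 88 columns you can then slice out whatever you need. For example, a quick look at which of the followed accounts are verified (the column names verified, screen_name and followers_count are those returned by rtweet's lookup_users):

#a quick slice of the metadata: verified accounts among those followed
verified_fds <- UoM_fds_data[UoM_fds_data$verified == TRUE, c("screen_name", "followers_count")]
head(verified_fds)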

5. Data cleaning.

The code below outlines the general steps taken to clean a set of tweets. Additional lines may be required depending on the desired output, and other forms of text data may require additional steps or types of cleaning. For this specific code, ensure that you import a csv file and that the column of text you want to clean is named “Text”.

# install these packages if you do not already have them installed.
install.packages("twitteR")
install.packages("plyr")
install.packages("stringr")
install.packages("ggplot2")
install.packages("tm")

#loading the libraries
library(twitteR)
library(plyr)
library(stringr)
library(ggplot2)
library(tm)

#import your dataset to analyse; ensure it is in the same directory as your code, otherwise you need to add the path

Dataset2 <- read.csv("Dataset2.csv")   #replace with your file name
tweets.df <- Dataset2$Text


#convert text to lowercase
tweets.df<-tolower(tweets.df)

#get rid of problem characters
tweets.df <- sapply(tweets.df,function(row) iconv(row, "latin1", "ASCII", sub=""))

#remove punctuation, digits, special characters etc
tweets.df = gsub("&amp", "", tweets.df)
tweets.df= gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets.df)
tweets.df = gsub("@\\w+", "", tweets.df)
tweets.df= gsub("[[:punct:]]", "", tweets.df)
tweets.df = gsub("[[:digit:]]", "", tweets.df)
tweets.df = gsub("http\\w+", "", tweets.df)
tweets.df = gsub("[ \t]{2,}", " ", tweets.df)
tweets.df= gsub("^\\s+|\\s+$", "", tweets.df) 


#collapse runs of whitespace into a single space
tweets.df <- str_replace_all(tweets.df, "\\s+", " ")
# Get rid of URLs
#tweets.df <- str_replace_all(tweets.df, "http://t.co/[a-z,A-Z,0-9]*{8}","")
# Take out retweet header, there is only one
tweets.df <- str_replace(tweets.df,"RT @[a-z,A-Z]*: ","")
# Get rid of hashtags
tweets.df <- str_replace_all(tweets.df,"#[a-z,A-Z]*","")
# Get rid of references to other screennames
tweets.df <- str_replace_all(tweets.df,"@[a-z,A-Z]*","")  

View(tweets.df)
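Depending on the analysis, you may also want to drop common stopwords at this point; tm's removeWords() works directly on a character vector. A small optional sketch using the English stopword list:

#optional: remove common English stopwords (tm is already loaded above)
tweets.df <- removeWords(tweets.df, stopwords("en"))
tweets.df <- stripWhitespace(tweets.df)  #tidy up the gaps left behind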

6. Topic Model.

The code below cleans a set of tweets, fits an LDA topic model with the lda package (collapsed Gibbs sampling) and visualises the topics with LDAvis. As before, ensure that you import a csv file and that the column of text you want to model is named “Text”.

#install.packages("LDAvis")
#install.packages("tm")
#install.packages("lda")
#install.packages("servr")
#install.packages("shiny")
#install.packages("stringr")

library(LDAvis)
library(tm)
library(lda)
library(shiny)
library(stringr)

stop_words <- stopwords("SMART")

Dataset2<-read.csv("C:/Users/User/Desktop/Dataset2.csv")   #replace with the path to your own file
tweet <- Dataset2$Text


tweet <- sapply(tweet, function(x) iconv(x, to='UTF-8', sub='byte'))


tweet= gsub("[[:punct:]]", "", tweet)
tweet = gsub("[[:digit:]]", "", tweet)
tweet= gsub("http\\w+", "", tweet)
tweet = gsub("[ \t]{2,}", " ", tweet)
tweet= gsub("^\\s+|\\s+$", "", tweet)
#ref: (Hicks, 2014)

#collapse runs of whitespace into a single space
tweet <- str_replace_all(tweet, "\\s+", " ")

tweet <- str_replace(tweet,"RT @[a-z,A-Z]*: ","")
# Get rid of hashtags
tweet <- str_replace_all(tweet,"#[a-z,A-Z]*","")
# Get rid of references to other screennames
tweet<- str_replace_all(tweet,"@[a-z,A-Z]*","")  

# tokenize on space and output as a list:
doc.list <- strsplit(tweet, "[[:space:]]+")

# compute the table of terms:
term.table <- table(unlist(doc.list))
term.table <- sort(term.table, decreasing = TRUE)

# remove terms that are stop words or occur fewer than 5 times:
del <- names(term.table) %in% stop_words | term.table < 5
term.table <- term.table[!del]
vocab <- names(term.table)

# now put the documents into the format required by the lda package:
get.terms <- function(x) {
  index <- match(x, vocab)
  index <- index[!is.na(index)]
  rbind(as.integer(index - 1), as.integer(rep(1, length(index))))
}
documents <- lapply(doc.list, get.terms)

# Compute some statistics related to the data set:
D <- length(documents)  # number of documents
W <- length(vocab)  # number of terms in the vocabulary
doc.length <- sapply(documents, function(x) sum(x[2, ]))  # number of tokens per document
N <- sum(doc.length)  # total number of tokens in the data
term.frequency <- as.integer(term.table)  # frequencies of terms in the corpus


# MCMC and model tuning parameters:
K <- 10       # number of topics
G <- 200      # number of Gibbs sampling iterations
alpha <- 0.5  # document-topic smoothing
eta <- 0.5    # topic-term smoothing

# Fit the model:
set.seed(357)
t1 <- Sys.time()
fit <- lda.collapsed.gibbs.sampler(documents = documents, K = K, vocab = vocab,
                                   num.iterations = G, alpha = alpha, eta = eta,
                                   initial = NULL, burnin = 0,
                                   compute.log.likelihood = TRUE)
t2 <- Sys.time()
t2 - t1

#LDAvis: smooth and normalise the document-topic and topic-term distributions
theta <- t(apply(fit$document_sums + alpha, 2, function(x) x/sum(x)))
phi <- t(apply(t(fit$topics) + eta, 2, function(x) x/sum(x)))


tweetvis <- list(phi = phi,
                     theta = theta,
                     doc.length = doc.length,
                     vocab = vocab,
                     term.frequency = term.frequency)


# create the JSON object to feed the visualization:
json <- createJSON(phi = tweetvis$phi, 
                   theta = tweetvis$theta, 
                   doc.length = tweetvis$doc.length, 
                   vocab = tweetvis$vocab, 
                   term.frequency = tweetvis$term.frequency)
serVis(json, out.dir = tempfile(), open.browser = interactive())

The result is an interactive web page where you can view the different topics and select the number of terms to display for each topic. You can read more about LDA visualisation in this paper (Sievert and Shirley, 2014): https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf
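Note that serVis() above writes the page to a temporary directory, so it disappears when the R session ends. If you want to keep the visualisation, point out.dir at a folder of your choosing (the folder name below is just an example):

#save the LDAvis page to a permanent folder instead of a temporary one
serVis(json, out.dir = "ldavis_output", open.browser = interactive())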

7. Emotion detection

The code below uses the NRC emotion lexicon to detect emotions in a text. For this code, we have removed the negative and positive (sentiment polarity) categories and only show the eight emotions (trust, anticipation, sadness, joy, anger, fear, surprise and disgust). The code assumes that the column holding the text to be analysed is called “text”.

library(syuzhet)
library(plotly)
library(tm)
library(wordcloud)

#import your dataset to analyse; ensure it is in the same directory as your code, otherwise you need to add the path

tweets <- read.csv("dataset.csv")   #replace with your file name
clean_tweets = tweets$text
  
  #clean_tweets = sapply(tweets, function(x) x$getText())
  # remove retweet entities
  clean_tweets = gsub('(RT|via)((?:\\b\\W*@\\w+)+)', '', clean_tweets)
  # remove at people
  clean_tweets = gsub('@\\w+', '', clean_tweets)
  # remove punctuation
  clean_tweets = gsub('[[:punct:]]', '', clean_tweets)
  # remove numbers
  clean_tweets = gsub('[[:digit:]]', '', clean_tweets)
  # remove html links
  clean_tweets = gsub('http\\w+', '', clean_tweets)
  # collapse multiple spaces into one
  clean_tweets = gsub('[ \t]{2,}', ' ', clean_tweets)
  clean_tweets = gsub('^\\s+|\\s+$', '', clean_tweets)
  # remove emojis or special characters
  clean_tweets = gsub('<.*>', '', enc2native(clean_tweets))
  
  clean_tweets = tolower(clean_tweets)
  
  #clean_tweets
  

  
  emotions <- get_nrc_sentiment(clean_tweets)
  emo_bar = colSums(emotions)
  emo_sum = data.frame(count=emo_bar, emotion=names(emo_bar))
  emo_sum$emotion = factor(emo_sum$emotion, levels=emo_sum$emotion[order(emo_sum$count, decreasing = TRUE)])
  
  emo_sum <- emo_sum[1:8,]
  emo_sum$percent<-(emo_sum$count/sum(emo_sum$count))*100
  
   #Visualize the emotions from NRC sentiments
plot_ly(emo_sum, x=~emotion, y=~percent, type="bar", color=~emotion) %>%
  layout(xaxis=list(title=""), yaxis=list(title="Percentage of emotion words", ticksuffix="%"),
         showlegend=FALSE, title="Distribution of emotion categories")
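The wordcloud package is loaded above but not otherwise used; as a quick complement to the bar chart, you can also plot the most frequent terms in the cleaned tweets. A minimal sketch using tm and wordcloud (both already loaded):

#optional: word cloud of the most frequent terms in the cleaned tweets
corpus <- Corpus(VectorSource(clean_tweets))
tdm <- TermDocumentMatrix(corpus, control = list(stopwords = TRUE))
word_freqs <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
wordcloud(names(word_freqs), word_freqs, max.words = 100, random.order = FALSE)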

8. Detecting Sentiment Polarity

This code uses an external sentiment lexicon to detect the polarity of a text corpus. It classifies the sentiment of a piece of text as positive, negative or neutral using lists of positive and negative sentiment words. Ensure that the positive and negative lexicon files are in the same directory as the code.

#install.packages("twitteR")
#install.packages("plyr")
#install.packages("stringr")
#install.packages("tm")
#install.packages("scales")


#loading the library
library(plyr)
library(stringr)
library(ggplot2)
library(tm)
library(scales)


#read in the file
file<-read.csv("Dataset2.csv")
tweets.df<-file$Text
tweets.df<-tolower(tweets.df)


tweets.df <- sapply(tweets.df,function(row) iconv(row, "latin1", "ASCII", sub=""))

#cleaning the tweets
tweets.df = gsub("&amp", "", tweets.df)
tweets.df= gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets.df)
tweets.df = gsub("@\\w+", "", tweets.df)
tweets.df= gsub("[[:punct:]]", "", tweets.df)
tweets.df = gsub("[[:digit:]]", "", tweets.df)
tweets.df = gsub("http\\w+", "", tweets.df)
tweets.df = gsub("[ \t]{2,}", " ", tweets.df)
tweets.df= gsub("^\\s+|\\s+$", "", tweets.df) 


#collapse runs of whitespace into a single space
tweets.df <- str_replace_all(tweets.df, "\\s+", " ")
# Get rid of URLs
#tweets.df <- str_replace_all(tweets.df, "http://t.co/[a-z,A-Z,0-9]*{8}","")
# Take out retweet header, there is only one
tweets.df <- str_replace(tweets.df,"RT @[a-z,A-Z]*: ","")
# Get rid of hashtags
tweets.df <- str_replace_all(tweets.df,"#[a-z,A-Z]*","")
# Get rid of references to other screennames
tweets.df <- str_replace_all(tweets.df,"@[a-z,A-Z]*","")  

#view cleaned tweets
View(tweets.df)


#Reading the lexicon of positive and negative words
pos <- readLines("positive_words.txt")
neg <- readLines("negative_words.txt")

#function to calculate sentiment score
score.sentiment <- function(sentences, pos.words, neg.words, .progress='none')
{
  # Parameters
  # sentences: vector of text to score
  # pos.words: vector of words of postive sentiment
  # neg.words: vector of words of negative sentiment
  # .progress: passed to laply() to control of progress bar
  
  # create simple array of scores with laply
  scores <- laply(sentences,
                  function(sentence, pos.words, neg.words)
                  {
                    # remove punctuation
                    sentence <- gsub("[[:punct:]]", "", sentence)
                    # remove control characters
                    sentence <- gsub("[[:cntrl:]]", "", sentence)
                    # remove digits
                    sentence <- gsub('\\d+', '', sentence)
                    
                    #convert to lower
                    sentence <- tolower(sentence)
                    
                    
                    # split sentence into words with str_split (stringr package)
                    word.list <- str_split(sentence, "\\s+")
                    words <- unlist(word.list)
                    
                    # compare words to the dictionaries of positive & negative terms
                    pos.matches <- match(words, pos.words)
                    neg.matches <- match(words, neg.words)
                    
                    # get the position of the matched term or NA
                    # we just want a TRUE/FALSE
                    pos.matches <- !is.na(pos.matches)
                    neg.matches <- !is.na(neg.matches)
                    
                    # final score
                    score <- sum(pos.matches) - sum(neg.matches)
                    return(score)
                  }, pos.words, neg.words, .progress=.progress )
  # data frame with scores for each sentence
  scores.df <- data.frame(text=sentences, score=scores)
  return(scores.df)
}
#sentiment score
scores_twitter <- score.sentiment(tweets.df, pos, neg, .progress='text')


View(scores_twitter)

#Summary of the sentiment scores
summary(scores_twitter)

scores_twitter$score_chr <- ifelse(scores_twitter$score < 0,'Negative', ifelse(scores_twitter$score > 0, 'Positive', 'Neutral'))


View(scores_twitter)


#Convert score_chr to factor for visualizations
scores_twitter$score_chr <- as.factor(scores_twitter$score_chr)
names(scores_twitter)[3]<-paste("Sentiment")  

#plot to show number of negative, positive and neutral comments
Viz1 <- ggplot(scores_twitter, aes(x=Sentiment, fill=Sentiment)) +
  geom_bar(aes(y = (..count..)/sum(..count..))) +
  scale_y_continuous(labels = percent) + labs(y="Percentage of tweets") +
  theme(text = element_text(size=15)) + theme(axis.text = element_text(size=15)) +
  theme(legend.position="none") + coord_cartesian(ylim=c(0,0.6)) +
  scale_fill_manual(values=c("firebrick1", "grey50", "limegreen"))
Viz1
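As a quick numeric check alongside the plot, you can also tabulate the share of each sentiment class:

#proportion of positive, negative and neutral tweets
prop.table(table(scores_twitter$Sentiment))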

9. Word Frequency Time Series

The code below measures the frequency of certain terms over time (months of the year). The data used here come from Twitter, so the code in its current form will only work for data with a similar date format (the usual Twitter date format, in a column named created_at).

library(slam)
library(tm)
library(lubridate)
library(syuzhet)
library(dplyr)
library("reshape2")
library("ggplot2")



#read in the data
textdata <- read.csv("dataset.csv", encoding = "UTF-8")   #replace with your file name

#convert the twitter data format
textdata$created <- as.POSIXct(textdata$created_at, format="%a %b %d %H:%M:%S +0000 %Y", tz="GMT")
#select the months and keep as a date format
textdata$Month <- format(as.Date(textdata$created), "%m")
textdata$Month2 <- months(textdata$created)

#take the text column and convert to a corpus (DataframeSource expects doc_id and text columns)
textdata$doc_id <- seq_len(nrow(textdata))  # include the doc_id
corpus <- Corpus(DataframeSource(textdata))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = TRUE)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

#form a document term matrix
DTM <- DocumentTermMatrix(corpus)


#select the terms you want to observe
terms_to_observe <- c( "meter", "boiler", "engineer")
#reduce the DTM to contain only those terms
DTM_reduced <- as.matrix(DTM[, terms_to_observe])
#sum the frequencies per month
counts_per_month<- aggregate(DTM_reduced, by = list(Month = textdata$Month), sum)


counts_per_month_long <- melt(counts_per_month, id="Month")  # convert to long format

#Visualize the word frequency time series
p2 <- ggplot(data = counts_per_month_long, aes(x = factor(Month), y = value, colour = variable)) +
  geom_line(aes(group = variable)) + geom_point() + xlab("Month") +
  ylab("Frequency") + labs(color='Terms to observe')

p2

10. Extract and visualize tweet geocodes

This code extracts and visualizes the geocodes of tweets matching a specific keyword search. It requires users to obtain a Google Geocoding API key. This can be obtained as follows: