First, we need to download R to our local computer from CRAN (the Comprehensive R Archive Network) at https://cran.r-project.org/. Download the package that applies to your computer and follow the prompts to install.
Next, we need to download and install RStudio from https://www.rstudio.com/products/rstudio/download/. Select the free RStudio Desktop open source licence and choose the package that applies to your computer.
In order to access the Twitter Search or Streaming API, we need to get four pieces of information from Twitter: the API key, API secret, access token and access token secret. You can obtain this information by following the steps below:
Once you have obtained the four authentication keys from Twitter, you can begin to search for specific keywords or hashtags of interest. In the code snippet below, we have extracted two hundred tweets with the keyword “upwork”. These variables can be adjusted as necessary.
#install relevant packages
install.packages("twitteR")
install.packages("ROAuth")
library(twitteR)
library(ROAuth)
#get authentication from Twitter
consumerKey <- "xxxxxxxxxxxxx"
consumerSecret <- "xxxxxxxxxxxxxx"
accessToken <- "xxxxxxxxx"
accessSecret <- "xxxxxxxxxxxx"
setup_twitter_oauth(consumerKey, consumerSecret, accessToken, accessSecret) # authenticate
#search for tweets by keyword
tweets <- searchTwitter("upwork", n=200, lang=NULL, since=NULL, until=NULL, locale=NULL, geocode=NULL, sinceID=NULL, maxID=NULL,
                        resultType=NULL, retryOnRateLimit=120)
#Put the tweets in a data frame
tweets<-twListToDF(tweets)
#write out to a csv
write.csv(tweets, file="filename.csv")
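If you want to exclude retweets from the results, one option is to add the standard Twitter search operator -filter:retweets to the query string. A minimal sketch, assuming this operator is available at your API access level (the output file name is just an example):
#optional: exclude retweets by using a standard Twitter search operator in the query
tweets_no_rt <- searchTwitter("upwork -filter:retweets", n=200, retryOnRateLimit=120)
tweets_no_rt <- twListToDF(tweets_no_rt)
write.csv(tweets_no_rt, file="filename_no_retweets.csv")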
You can retrieve a user’s timeline up to a maximum of 3,200 tweets. Install rtweet and provide the authentication keys from Twitter. Use the get_timeline() or get_timelines() function to retrieve tweets that have been posted by one or more users. Specify the number of tweets you want to extract, then save your output in your desired format.
install.packages("rtweet")#install rtweet if not alredy installed
library(rtweet)
#insert the consumer key and consumer secret from twitter
create_token(
consumer_key = "xxxxxxx",
consumer_secret = "xxxxxxxxxx"
)
timeline <- get_timeline("OfficialUoM", n=200) # replace OfficialUoM with the target user screen name or ID
timelines <- get_timelines(c("OfficialUoM","skynews"), n=200) # to get multiple users' timelines
write.csv(timeline, file="timeline.csv")
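The multi-user result can be saved in the same way; a minimal sketch (the file name is just an example):
write.csv(timelines, file="timelines.csv") # save the multiple users' timelines to a csv file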
To retrieve a list of IDs following a specific user, use the code below.
install.packages("rtweet")#install rtweet if not alredy installed
library(rtweet)
#insert the consumer key and consumer secret from twitter
create_token(
consumer_key = "xxxxxxxx",
consumer_secret = "xxxxxxxx"
)
followers <- get_followers("skynews", n = 5000, page = "-1", retryonratelimit = FALSE,
                           parse = TRUE, verbose = TRUE, token = NULL) # get the ID list of followers
count <- nrow(followers) # number of followers
write.csv(followers, file="followers.csv") # save the follower IDs to a csv file
To retrieve an ID list of accounts followed by a user, use the code below. In addition, using the user IDs, you can retrieve other data about these accounts such as name, location, language, status count, etc. The result will be a data frame with 88 metadata columns.
install.packages("rtweet")#install rtweet if not alredy installed
library(rtweet)
#insert the consumer key and consumer secret from twitter
create_token(
consumer_key = "xxxxxxxxxx",
consumer_secret = "xxxxxxxxxxxxx"
)
UoM_fds <- get_friends("OfficialUoM") # get the list of accounts followed by the University of Manchester official Twitter account
write.csv(UoM_fds, file="uomfds.csv") #save result as a csv file
## lookup data on those accounts: name, location, language, etc.
UoM_fds_data <- lookup_users(UoM_fds$user_id)
write.csv(UoM_fds_data, file="uomfdsdata.csv")
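If you only need a few of the 88 metadata columns, you can subset the data frame before saving. A minimal sketch, assuming these rtweet column names (they may differ slightly between rtweet versions) and an example file name:
# keep only a few columns of interest
UoM_fds_subset <- UoM_fds_data[, c("screen_name", "name", "location", "followers_count", "statuses_count")]
write.csv(UoM_fds_subset, file="uomfdssubset.csv")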
The code below outlines the general steps taken to clean a set of tweets. Additional lines may be required depending on the desired output. Other forms of text data may require additional steps or forms of cleaning. For this specific code, do ensure that you import a csv file and that the column of text you want to clean is named “Text”.
# install these packages if you do not already have them installed.
install.packages("twitteR")
install.packages("plyr")
install.packages("stringr")
install.packages("tm")
#loading the library
library(twitteR)
library(plyr)
library(stringr)
library(ggplot2)
library(tm)
#import your dataset to analyse, ensure it is in the same directory as your code, otherwise you need to add the path
Dataset2 <- read.csv("dataset.csv") # replace with the name/path of your csv file
tweets.df <- Dataset2$Text
#convert text tolowercase
tweets.df<-tolower(tweets.df)
#get rid of problem characters
tweets.df <- sapply(tweets.df,function(row) iconv(row, "latin1", "ASCII", sub=""))
#remove punctuation, digits, special characters etc
tweets.df = gsub("&", "", tweets.df)
tweets.df= gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets.df)
tweets.df = gsub("@\\w+", "", tweets.df)
tweets.df= gsub("[[:punct:]]", "", tweets.df)
tweets.df = gsub("[[:digit:]]", "", tweets.df)
tweets.df = gsub("http\\w+", "", tweets.df)
tweets.df = gsub("[ \t]{2,}", "", tweets.df)
tweets.df= gsub("^\\s+|\\s+$", "", tweets.df)
#get rid of unnecessary spaces
tweets.df <- str_replace_all(tweets.df, "\\s+", " ") # collapse multiple spaces into one
# Get rid of URLs
#tweets.df <- str_replace_all(tweets.df, "http://t.co/[a-z,A-Z,0-9]*{8}","")
# Take out retweet header, there is only one
tweets.df <- str_replace(tweets.df,"RT @[a-z,A-Z]*: ","")
# Get rid of hashtags
tweets.df <- str_replace_all(tweets.df,"#[a-z,A-Z]*","")
# Get rid of references to other screennames
tweets.df <- str_replace_all(tweets.df,"@[a-z,A-Z]*","")
View(tweets.df)
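As noted above, additional cleaning steps may be required depending on the desired output. A minimal sketch of one optional extra step, removing English stop words with the tm package (already loaded above), applied to the cleaned vector tweets.df:
# optional: remove common English stop words
tweets.df <- removeWords(tweets.df, stopwords("en"))
# collapse the double spaces left behind after removing words
tweets.df <- str_replace_all(tweets.df, "\\s+", " ")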
The code below outlines the general steps taken to clean a set of tweets and fit an LDA topic model, with the resulting topics visualised using LDAvis. Additional lines may be required depending on the desired output. For this specific code, do ensure that you import a csv file and that the column of text you want to analyse is named “Text”.
#install.packages("LDAvis")
#install.packages("tm")
#install.packages("lda")
#install.packages("servr")
#install.packages("shiny")
#install.packages("stringr")
library(LDAvis)
library(tm)
library(lda)
library(shiny)
library(stringr)
stop_words <- stopwords("SMART")
Dataset2<-read.csv("C:/Users/User/Desktop/Dataset2.csv")
tweet <- Dataset2$Text
tweet <- sapply(tweet, function(x) iconv(x, to='UTF-8', sub='byte'))
tweet= gsub("[[:punct:]]", "", tweet)
tweet = gsub("[[:digit:]]", "", tweet)
tweet= gsub("http\\w+", "", tweet)
tweet = gsub("[ \t]{2,}", "", tweet)
tweet= gsub("^\\s+|\\s+$", "", tweet)
#ref: ( Hicks , 2014)
#get rid of unnecessary spaces
tweet <- str_replace_all(tweet, "\\s+", " ") # collapse multiple spaces into one
tweet <- str_replace(tweet,"RT @[a-z,A-Z]*: ","")
# Get rid of hashtags
tweet <- str_replace_all(tweet,"#[a-z,A-Z]*","")
# Get rid of references to other screennames
tweet<- str_replace_all(tweet,"@[a-z,A-Z]*","")
# tokenize on space and output as a list:
doc.list <- strsplit(tweet, "[[:space:]]+")
# compute the table of terms:
term.table <- table(unlist(doc.list))
term.table <- sort(term.table, decreasing = TRUE)
# remove terms that are stop words or occur fewer than 5 times:
del <- names(term.table) %in% stop_words | term.table < 5
term.table <- term.table[!del]
vocab <- names(term.table)
# now put the documents into the format required by the lda package:
get.terms <- function(x) {
  index <- match(x, vocab)
  index <- index[!is.na(index)]
  rbind(as.integer(index - 1), as.integer(rep(1, length(index))))
}
documents <- lapply(doc.list, get.terms)
# Compute some statistics related to the data set:
D <- length(documents) # number of documents
W <- length(vocab) # number of terms in the vocab
doc.length <- sapply(documents, function(x) sum(x[2, ])) # number of tokens per document
N <- sum(doc.length) # total number of tokens in the data
term.frequency <- as.integer(term.table) # frequencies of terms in the corpus
# MCMC and model tuning parameters:
K <- 10      # number of topics
G <- 200     # number of Gibbs sampling iterations
alpha <- 0.5 # Dirichlet prior on the document-topic distributions
eta <- 0.5   # Dirichlet prior on the topic-term distributions
# Fit the model:
library(lda)
set.seed(357)
t1 <- Sys.time()
fit <- lda.collapsed.gibbs.sampler(documents = documents, K = K, vocab = vocab,
                                   num.iterations = G, alpha = alpha, eta = eta,
                                   initial = NULL, burnin = 0,
                                   compute.log.likelihood = TRUE)
t2 <- Sys.time()
t2 - t1
#LDAvis
theta <- t(apply(fit$document_sums + alpha, 2, function(x) x/sum(x)))
phi <- t(apply(t(fit$topics) + eta, 2, function(x) x/sum(x)))
tweetvis <- list(phi = phi,
theta = theta,
doc.length = doc.length,
vocab = vocab,
term.frequency = term.frequency)
# create the JSON object to feed the visualization:
json <- createJSON(phi = tweetvis$phi,
theta = tweetvis$theta,
doc.length = tweetvis$doc.length,
vocab = tweetvis$vocab,
term.frequency = tweetvis$term.frequency)
serVis(json, out.dir = tempfile(), open.browser = interactive())
The result is an interactive webpage where you can view the different topics and select the number of terms to include in a topic. You can read more about LDA visualisation in this paper (Sievert, 2014): https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf
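By default serVis writes the visualisation to a temporary directory. If you want to keep or share the output, a minimal sketch (the directory name is just an example) is to point out.dir at a permanent folder:
# write the LDAvis output to a permanent folder so it can be reopened or shared later
serVis(json, out.dir = "ldavis_output", open.browser = interactive())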
The code below uses the NRC emotion lexicon to detect emotions in a text. For this code, we have removed the negative and positive (sentiment polarity) categories and only show eight emotions (trust, anticipation, sadness, joy, anger, fear, surprise and disgust). This code assumes that the column containing the text to be analysed is named “text”.
library(syuzhet)
library(plotly)
library(tm)
library(wordcloud)
#import your dataset to analyse, ensure it is in the same directory as your code, otherwise you need to add the path
tweets <- read.csv("dataset.csv") # replace with the name/path of your csv file
clean_tweets = tweets$text
#clean_tweets = sapply(tweets, function(x) x$getText())
# remove retweet entities
clean_tweets = gsub('(RT|via)((?:\\b\\W*@\\w+)+)', '', clean_tweets)
# remove at people
clean_tweets = gsub('@\\w+', '', clean_tweets)
# remove punctuation
clean_tweets = gsub('[[:punct:]]', '', clean_tweets)
# remove numbers
clean_tweets = gsub('[[:digit:]]', '', clean_tweets)
# remove html links
clean_tweets = gsub('http\\w+', '', clean_tweets)
# remove unnecessary spaces
clean_tweets = gsub('[ \t]{2,}', '', clean_tweets)
clean_tweets = gsub('^\\s+|\\s+$', '', clean_tweets)
# remove emojis or special characters
clean_tweets = gsub('<[^>]+>', '', enc2native(clean_tweets))
clean_tweets = tolower(clean_tweets)
#clean_tweets
emotions <- get_nrc_sentiment(clean_tweets)
emo_bar = colSums(emotions)
emo_sum = data.frame(count=emo_bar, emotion=names(emo_bar))
emo_sum$emotion = factor(emo_sum$emotion, levels=emo_sum$emotion[order(emo_sum$count, decreasing = TRUE)])
emo_sum <- emo_sum[1:8,]
emo_sum$percent<-(emo_sum$count/sum(emo_sum$count))*100
#Visualize the emotions from NRC sentiments
plot_ly(emo_sum, x=~emotion, y=~percent, type="bar", color=~emotion) %>%
  layout(xaxis=list(title=""), yaxis=list(title="Percentage of emotions", ticksuffix="%"),
         showlegend=FALSE, title="Distribution of emotion categories")
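The wordcloud and tm packages loaded above are not used in this snippet; a minimal sketch of how they could be used to view the most frequent words in the cleaned tweets:
# optional: word cloud of the most frequent terms in the cleaned tweets
corpus <- Corpus(VectorSource(clean_tweets))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
tdm <- TermDocumentMatrix(corpus)
word_freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
wordcloud(names(word_freq), word_freq, max.words = 100, random.order = FALSE)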
This code uses an external sentiment lexicon to detect the polarity of a text corpus. It classifies the sentiment of a piece of text as either positive, negative or neutral using positive and negative sentiment lexicons. Do ensure that the positive and negative lexicon files are in the same directory as the code.
#install.packages("twitteR")
#install.packages("plyr")
#install.packages("stringr")
#install.packages("tm")
#install.packages("scales")
#loading the library
library(plyr)
library(stringr)
library(ggplot2)
library(tm)
library(scales)
#read in the file
file <- read.csv("Dataset2.csv") # replace with the name/path of your csv file
tweets.df<-file$Text
tweets.df<-tolower(tweets.df)
tweets.df <- sapply(tweets.df,function(row) iconv(row, "latin1", "ASCII", sub=""))
#cleaning the tweets
tweets.df = gsub("&", "", tweets.df)
tweets.df= gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets.df)
tweets.df = gsub("@\\w+", "", tweets.df)
tweets.df= gsub("[[:punct:]]", "", tweets.df)
tweets.df = gsub("[[:digit:]]", "", tweets.df)
tweets.df = gsub("http\\w+", "", tweets.df)
tweets.df = gsub("[ \t]{2,}", "", tweets.df)
tweets.df= gsub("^\\s+|\\s+$", "", tweets.df)
#get rid of unnecessary spaces
tweets.df <- str_replace_all(tweets.df, "\\s+", " ") # collapse multiple spaces into one
# Get rid of URLs
#tweets.df <- str_replace_all(tweets.df, "http://t.co/[a-z,A-Z,0-9]*{8}","")
# Take out retweet header, there is only one
tweets.df <- str_replace(tweets.df,"RT @[a-z,A-Z]*: ","")
# Get rid of hashtags
tweets.df <- str_replace_all(tweets.df,"#[a-z,A-Z]*","")
# Get rid of references to other screennames
tweets.df <- str_replace_all(tweets.df,"@[a-z,A-Z]*","")
#view cleaned tweets
View(tweets.df)
#Reading the Lexicon positive and negative words
pos <- readLines("positive_words.txt")
neg <- readLines("negative_words.txt")
#function to calculate sentiment score
score.sentiment <- function(sentences, pos.words, neg.words, .progress='none')
{
# Parameters
# sentences: vector of text to score
# pos.words: vector of words of postive sentiment
# neg.words: vector of words of negative sentiment
# .progress: passed to laply() to control of progress bar
# create simple array of scores with laply
scores <- laply(sentences,
function(sentence, pos.words, neg.words)
{
# remove punctuation
sentence <- gsub("[[:punct:]]", "", sentence)
# remove control characters
sentence <- gsub("[[:cntrl:]]", "", sentence)
# remove digits
sentence <- gsub('\\d+', '', sentence)
#convert to lower
sentence <- tolower(sentence)
# split sentence into words with str_split (stringr package)
word.list <- str_split(sentence, "\\s+")
words <- unlist(word.list)
# compare words to the dictionaries of positive & negative terms
pos.matches <- match(words, pos.words)
neg.matches <- match(words, neg.words)
# get the position of the matched term or NA
# we just want a TRUE/FALSE
pos.matches <- !is.na(pos.matches)
neg.matches <- !is.na(neg.matches)
# final score
score <- sum(pos.matches) - sum(neg.matches)
return(score)
}, pos.words, neg.words, .progress=.progress )
# data frame with scores for each sentence
scores.df <- data.frame(text=sentences, score=scores)
return(scores.df)
}
#sentiment score
scores_twitter <- score.sentiment(tweets.df, pos, neg, .progress='text')
View(scores_twitter)
#Summary of the sentiment scores
summary(scores_twitter)
scores_twitter$score_chr <- ifelse(scores_twitter$score < 0, 'Negative', ifelse(scores_twitter$score > 0, 'Positive', 'Neutral'))
View(scores_twitter)
#Convert score_chr to factor for visualizations
scores_twitter$score_chr <- as.factor(scores_twitter$score_chr)
names(scores_twitter)[3] <- "Sentiment"
#plot to show number of negative, positive and neutral comments
Viz1 <- ggplot(scores_twitter, aes(x=Sentiment, fill=Sentiment))+ geom_bar(aes(y = (..count..)/sum(..count..))) +
scale_y_continuous(labels = percent)+labs(y="Score")+
theme(text =element_text(size=15))+theme(axis.text = element_text(size=15))+ theme(legend.position="none")+ coord_cartesian(ylim=c(0,0.6)) + scale_fill_manual(values=c("firebrick1", "grey50", "limeGREEN"))
Viz1
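A minimal follow-up sketch (the file name is just an example): save the scored tweets and check the share of each sentiment class.
# save the scored tweets and show the proportion of each sentiment class
write.csv(scores_twitter, file="sentiment_scores.csv", row.names=FALSE)
prop.table(table(scores_twitter$Sentiment))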
The code below measures the frequency of certain terms over time (months of the year). The data used here is from Twitter, so this code in its current form will only work for data with a similar date format (usually the Twitter date format, with column name created_at).
library(slam)
library(tm)
library(lubridate)
library(syuzhet)
library(dplyr)
library("reshape2")
library("ggplot2")
#read in the data
textdata <- read.csv("dataset.csv", encoding = "UTF-8") # replace with the name/path of your csv file
#convert the twitter data format
textdata$created <- as.POSIXct(textdata$created_at, format="%a %b %d %H:%M:%S +0000 %Y", tz="GMT")
#select the months and keep as a date format
textdata$Month <- format(as.Date(textdata$created), "%m")
textdata$Month2 <- months(textdata$created)
#take the text column and convert to a corpus
textdata$doc_id <- seq_len(nrow(textdata)) # include the doc_id required by DataframeSource
# textdata$text <- as.character(textdata$text) # optional: ensure the text column is character
corpus <- Corpus(DataframeSource(textdata))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = TRUE)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
#form a document term matrix
DTM <- DocumentTermMatrix(corpus)
#select the terms you want to observe
terms_to_observe <- c( "meter", "boiler", "engineer")
#reduce the DTM to contain only those terms
DTM_reduced <- as.matrix(DTM[, terms_to_observe])
#sum the frequencies per month
counts_per_month<- aggregate(DTM_reduced, by = list(decade = textdata$Month), sum)
counts_per_month_long <- melt(counts_per_month, id="decade") # convert to long format
#Visualize the word frequency time series
p2 <- ggplot(data = counts_per_month_long, aes(x = factor(decade), y = value, colour = variable)) +
geom_line(aes(group = variable)) + geom_point() + xlab("Month") +
ylab("Frequency") + labs(color='Terms to observe')
p2
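If the number of tweets varies a lot from month to month, absolute counts can be misleading. A minimal sketch, under the assumption that you prefer relative frequencies, divides each month's term counts by the number of tweets collected in that month:
#optional: convert absolute counts to counts per tweet for each month
tweets_per_month <- as.data.frame(table(decade = textdata$Month))
counts_rel <- merge(counts_per_month, tweets_per_month, by = "decade")
counts_rel[, terms_to_observe] <- counts_rel[, terms_to_observe] / counts_rel$Freq
counts_rel_long <- melt(counts_rel[, c("decade", terms_to_observe)], id = "decade")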
This code extracts and visualizes the geocodes of tweets relating to a specific keyword search. It requires users to obtain a Google Geocoding API key. This can be obtained as follows: