R Markdown
#*******************************************************************************************
# Connect to Twitter and Download the tweets based on the hashtag
#*******************************************************************************************
# rm(list=ls(all=TRUE))
#
# setwd("G:\\Great Lakes\\Assignments\\WSMA Group1 Assignment")
# getwd()
#
# #install.packages("SchedulerR")
# library(twitteR)
# library(httr)
#
# #Check below URL
# # https://www.r-bloggers.com/extract-twitter-data-automatically-using-scheduler-r-package/
# api_key <- "xxxx"
# api_secret <- "xxxx"
# access_token <- "xxxx-xxxx"
# access_token_secret <- "xxxx"
# setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
#
#
# library(tm)
# tweets <- searchTwitter("#Jio", n=5000, lang="en")
# tweetsc.df <- twListToDF(tweets)
# dim(tweetsc.df)
# View(tweetsc.df)
#
# write.csv(tweetsc.df, file = "Jio_Tweet.csv",row.names=FALSE)
#*******************************************************************************************
# Load the required R libraries
#*******************************************************************************************
rm(list=ls())
library(RColorBrewer)
library(wordcloud)
library(SnowballC)
library(RCurl)
## Loading required package: bitops
library(ggplot2)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(twitteR)
library(ROAuth)
library(plyr)
##
## Attaching package: 'plyr'
## The following object is masked from 'package:twitteR':
##
## id
library(stringr)
library(base64enc)
# install.packages("topicmodels")
library(topicmodels)
library(data.table)
library(stringi)
library(devtools)
#install_github('okugami79/sentiment140')
library(sentiment)
## Loading required package: rjson
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
##
## Attaching package: 'qdapTools'
## The following object is masked from 'package:data.table':
##
## shift
## The following object is masked from 'package:plyr':
##
## id
## The following object is masked from 'package:twitteR':
##
## id
##
## Attaching package: 'qdap'
## The following object is masked from 'package:stringr':
##
## %>%
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:base':
##
## Filter
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdap':
##
## %>%
## The following object is masked from 'package:qdapTools':
##
## id
## The following object is masked from 'package:qdapRegex':
##
## explain
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:twitteR':
##
## id, location
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#*******************************************************************************************
# Read the Twitter data already downloaded
#*******************************************************************************************
#read the tweets file
tweets.df <- read.csv("Jio_Tweet.csv",stringsAsFactors = FALSE)
tweets.df$created <- as.Date(tweets.df$created, format= "%Y-%m-%d")
# View(tweets.df)
#*******************************************************************************************
# Process/clean the Twitter data
#*******************************************************************************************
# Remove character strings enclosed between < and >
tweets.df$text <- genX(tweets.df$text, " <", ">")
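# (If qdap's genX() is unavailable, a roughly equivalent base-R substitution could be used
# instead; the commented regex line below is an illustrative sketch, not the original approach.)
# tweets.df$text <- gsub(" <[^>]*>", "", tweets.df$text)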
# Create document corpus with tweet text
myCorpus<- Corpus(VectorSource(tweets.df$text))
# convert to Lowercase
myCorpus <- tm_map(myCorpus, content_transformer(stri_trans_tolower))
# Remove the links (URLs)
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
# Remove anything except English letters and spaces
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
# Remove Single letter words
removeSingle <- function(x) gsub(" . ", " ", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeSingle))
# Remove Extra Whitespaces
myCorpus<- tm_map(myCorpus, stripWhitespace)
# keep a copy of "myCorpus" for stem completion later
myCorpusCopy<- myCorpus
# Stem words in the corpus
myCorpus<-tm_map(myCorpus, stemDocument)
writeLines(strwrap(myCorpus[[250]]$content,60))
## rt sanjaybafna relianc jio revis it happi new year plan now
## unlimit call and gb data per day just rs onward
# Function to correct/complete the text after stemming
stemCompletion2 <- function(x, dictionary) {
  x <- unlist(strsplit(as.character(x), " "))
  x <- x[x != ""]
  x <- stemCompletion(x, dictionary = dictionary)
  x <- paste(x, sep = "", collapse = " ")
  PlainTextDocument(stripWhitespace(x))
}
# Apply stem completion and display the same tweet as above with the completed and corrected text
myCorpus <- lapply(myCorpus, stemCompletion2, dictionary=myCorpusCopy)
myCorpus <- Corpus(VectorSource(myCorpus))
writeLines(strwrap(myCorpus[[250]]$content, 60))
## list(content = "rt sanjaybafna reliance jio revised it new
## year plan now unlimited call and gb data per day just rs
## onwards", meta = list(author = character(0), datetimestamp
## = list(sec = 39.3611810207367, min = 13, hour = 4, mday =
## 7, mon = 0, year = 118, wday = 0, yday = 6, isdst = 0),
## description = character(0), heading = character(0), id =
## character(0), language = character(0), origin =
## character(0)))
# Correcting misspelled words: first, a helper to count how often a given word occurs in the corpus
wordFreq <- function(corpus, word)
{
  results <- lapply(corpus,
                    function(x){ grep(as.character(x), pattern = paste0("\\<", word)) })
  sum(unlist(results))
}
n.call<- wordFreq(myCorpusCopy, "call")
n.plan <- wordFreq(myCorpusCopy, "plan")
cat(n.call, n.plan)
## 1399 1511
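# The same helper can be reused to check any other keyword of interest, e.g.
# (illustrative only, not part of the original run):
# wordFreq(myCorpusCopy, "data")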
# Helper to replace a word with its proper/corrected form
replaceWord <- function(corpus, oldword, newword)
{
  tm_map(corpus, content_transformer(gsub), pattern = oldword, replacement = newword)
}
myCorpus<- replaceWord(myCorpus, "languag", "language")
# Remove stopwords. The extra tokens below (e.g. "character" and the bare numbers) come from
# document metadata that was serialized into the text during stem completion.
myStopWords <- c(stopwords('english'), c("character","0","1","2","3","4","5","6","7","8","9","10","12","13","14","15","16","17","18","20","21","23","24","25","26","27","28","29","30","31","32","33","34","35","38","36","40","44","45","46","118","119"))
myCorpus <- tm_map(myCorpus, removeWords, myStopWords)
tdm<- TermDocumentMatrix(myCorpus, control= list(wordLengths= c(1, Inf)))
tdm
## <<TermDocumentMatrix (terms: 4885, documents: 2784)>>
## Non-/sparse entries: 94411/13505429
## Sparsity : 99%
## Maximal term length: 25
## Weighting : term frequency (tf)
idx <- which(dimnames(tdm)$Terms %in% c("call", "plan"))
as.matrix(tdm[idx,21:60])
## Docs
## Terms 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
## call 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1
## plan 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
## Docs
## Terms 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## call 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## plan 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
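# Given the 99% sparsity reported above, the term-document matrix could optionally be trimmed
# with tm's removeSparseTerms() before further analysis. This is a sketch of an alternative step,
# not applied in the original analysis; the 0.98 threshold is an assumption.
# tdm.dense <- removeSparseTerms(tdm, sparse = 0.98)  # keep terms present in at least ~2% of tweets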
#*******************************************************************************************
# word frequency, plot and Word Cloud
#*******************************************************************************************
# Find the terms used most frequently
(freq.terms <- findFreqTerms(tdm, lowfreq = 50))
## [1] "author" "call" "content" "data"
## [5] "datetimestamp" "day" "description" "gb"
## [9] "heading" "hour" "id" "isdst"
## [13] "jio" "just" "languagee" "list"
## [17] "mday" "meta" "min" "mon"
## [21] "new" "now" "onwards" "origin"
## [25] "per" "plan" "reliance" "revised"
## [29] "rs" "rt" "sanjaybafna" "sec"
## [33] "unlimited" "wday" "yday" "year"
## [37] "airtel" "network" "will" "19"
## [41] "announce" "offer" "reliancejio" "bangalore"
## [45] "boost" "enter" "groudon" "hatchminuted"
## [49] "left" "raid" "sponsored" "valor"
## [53] "weather" "41" "india" "47"
## [57] "details" "48" "get" "additional"
## [61] "enhanced" "provide" "jiocare" "speed"
## [65] "today" "mystic" "deal" "rcom"
## [69] "acquire" "assets" "communication" "mhz"
## [73] "spectrum" "chennai" "signal" "instinct"
## [77] "minutes" "official" "infocomm" "acquisition"
## [81] "agreement" "breaking" "definitive"
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq > 50)
df <- data.frame(term = names(term.freq), freq= term.freq)
# Plot the chart of frequent terms
ggplot(df, aes(reorder(term, freq), freq)) + theme_bw() + geom_bar(stat = "identity") + coord_flip() + labs(title = "Term Frequency Chart", x = "Terms", y = "Term Counts")

# Calculate word frequencies, sort them by frequency, and set up the word cloud
word.freq <-sort(rowSums(as.matrix(tdm)), decreasing= F)
pal<- brewer.pal(8, "Dark2")
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 2, random.order = F, colors = pal, max.words = 100)

#*******************************************************************************************
# find word correlations
#*******************************************************************************************
# Identify and plot word correlations, for example for the word "year"
WordCorr <- apply_as_df(myCorpus[1:500], word_cor, word = "year", r=.25)
## Warning in FUN(X[[i]], ...): The sd on the following words was 0:
## author, datetimestamp, description, heading, hour, id, isdst, languagee, list, mday, meta, min, mon, origin, sec, wday, yday
plot(WordCorr)

qheat(vect2df(WordCorr[[1]], "word", "cor"), values=TRUE, high="red",
digits=2, order.by ="cor", plot = FALSE) + coord_flip()
## Warning: Ignoring unknown aesthetics: fill

# Find associations with specific keywords in the tweets - say "call" and "plan"
findAssocs(tdm, "call", 0.2)
## $call
## unlimited just onwards revised gb per
## 0.95 0.94 0.94 0.94 0.90 0.90
## day year now new rs data
## 0.89 0.87 0.84 0.83 0.81 0.78
## plan sanjaybafna rt reliance
## 0.78 0.76 0.63 0.44
findAssocs(tdm, "plan", 0.2)
## $plan
## new revised onwards unlimited year
## 0.86 0.82 0.81 0.81 0.81
## data day gb just per
## 0.80 0.80 0.80 0.80 0.80
## rs call now sanjaybafna rt
## 0.80 0.78 0.76 0.67 0.63
## reliance price trollcinemaoff discount jiog
## 0.41 0.23 0.23 0.21 0.20
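# As a supplementary view (an illustrative sketch, not part of the original report), the
# association scores returned by findAssocs() can be reshaped into a data frame and plotted
# with ggplot2; the object names assoc.call and assoc.df below are hypothetical.
assoc.call <- findAssocs(tdm, "call", 0.2)$call
assoc.df <- data.frame(term = names(assoc.call), cor = as.numeric(assoc.call))
ggplot(assoc.df, aes(reorder(term, cor), cor)) + theme_bw() + geom_bar(stat = "identity") +
  coord_flip() + labs(title = "Terms Associated with 'call'", x = "Terms", y = "Correlation")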
#*******************************************************************************************
# Topic modelling to identify latent/hidden topics using the LDA technique
#*******************************************************************************************
dtm <- as.DocumentTermMatrix(tdm)
rowTotals <- apply(dtm , 1, sum)
NullDocs <- dtm[rowTotals==0, ]
dtm <- dtm[rowTotals> 0, ]
if (length(NullDocs$dimnames$Docs) > 0) {
  tweets.df <- tweets.df[-as.numeric(NullDocs$dimnames$Docs), ]
}
lda <- LDA(dtm, k = 5) # fit a 5-topic model
term <- terms(lda, 7) # first 7 terms of every topic
(term <- apply(term, MARGIN = 2, paste, collapse = ", "))
## Topic 1
## "list, meta, year, origin, hour, wday, mon"
## Topic 2
## "list, heading, mon, isdst, mday, languagee, rt"
## Topic 3
## "list, jio, id, year, min, origin, yday"
## Topic 4
## "datetimestamp, min, hour, year, content, heading, isdst"
## Topic 5
## "list, yday, id, mon, content, sec, mday"
topics<- topics(lda)
topics<- data.frame(date=(tweets.df$created), topic = topics)
qplot (date, ..count.., data=topics, geom ="density", fill= term[topic], position="stack")
## Warning: `position` is deprecated

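# The per-document topic probabilities behind the assignments above can also be inspected with
# topicmodels' posterior() (an optional sketch; the object name topic.prob is hypothetical):
topic.prob <- posterior(lda)$topics   # documents x 5 matrix of topic probabilities
round(head(topic.prob), 3)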
#*******************************************************************************************
# Sentiment Analysis
#*******************************************************************************************
# Sentiment analysis to identify positive/negative tweets
# Create the calculate.sentiment.score function to do this
# It takes each tweet's text plus custom positive and negative word lists (read from files) as inputs
calculate.sentiment.score <- function(sentences, positive.words, negative.words, .progress='none')
{
  require(plyr)
  require(stringr)
  # we got a vector of sentences. plyr will handle a list or a vector as an "l" for us
  # we want a simple array of scores back, so we use "l" + "a" + "ply" = laply:
  scores <- laply(sentences, function(sentence, positive.words, negative.words)
  {
    ## clean up sentences with R's regex-driven global substitute, gsub():
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    # remove retweets
    sentence <- gsub('(RT|via)((?:\\b\\W*@\\W+)+)', '', sentence)
    # remove at people
    sentence <- gsub('@\\w+', '', sentence)
    # remove punctuation
    sentence <- gsub('[[:punct:]]', '', sentence)
    # remove numbers
    sentence <- gsub('[[:digit:]]', '', sentence)
    # remove html links
    sentence <- gsub('http[s]?\\w+', '', sentence)
    # remove extra spaces
    sentence <- gsub('[ \t]{2,}', '', sentence)
    sentence <- gsub('^\\s+|\\s+$', '', sentence)
    # remove NA's
    sentence <- sentence[!is.na(sentence)]
    # convert to lower case:
    sentence <- tolower(sentence)
    # split into words. str_split is in the stringr package
    word.list <- str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words <- unlist(word.list)
    # compare our words to the dictionaries of positive & negative terms
    negative.matches <- match(words, negative.words)
    positive.matches <- match(words, positive.words)
    # match() returns the position of the matched term or NA
    # we just want a TRUE/FALSE:
    positive.matches <- !is.na(positive.matches)
    negative.matches <- !is.na(negative.matches)
    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score <- sum(positive.matches) - sum(negative.matches)
    return(score)
  }, positive.words, negative.words, .progress=.progress)
  scores.df <- data.frame(score=scores, text=sentences)
  return(scores.df)
}
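# A quick sanity check of the scorer on toy input (an illustrative example; these two sentences
# and the one-word dictionaries are hypothetical, not taken from the tweet data):
calculate.sentiment.score(c("jio plans are great", "the network is bad today"),
                          positive.words = c("great"),
                          negative.words = c("bad"))
# expected scores: 1 for the first sentence and -1 for the second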
# Read the positive and negative word lists from the custom files
positive <- scan("positive-words.txt", what= "character", comment.char= ";")
negative <- scan("negative-words.txt", what= "character", comment.char= ";")
tweets_text <- as.character(tweets.df$text)
#calculate the sentiment score for each tweet
tweets_sentiment <- calculate.sentiment.score(tweets_text, positive, negative, .progress="none")
# Append the tweet date to the data frame and add a truncated (date-only) column
tweets_sentiment$date <- tweets.df$created
tweets_sentiment$date_trunc <- as.Date(tweets_sentiment$date,format="%d-%m-%Y")
# View(tweets_sentiment)
# Tweets with score > 0 are positive, score < 0 are negative, and score == 0 are neutral
tweets_sentiment$sentiment[tweets_sentiment$score == 0] <- "Neutral"
tweets_sentiment$sentiment[tweets_sentiment$score < 0] <- "Negative"
tweets_sentiment$sentiment[tweets_sentiment$score > 0] <- "Positive"
tweets_sentiment$sentiment <- factor(tweets_sentiment$sentiment)
# Check the tweet counts for each sentiment score
table(tweets_sentiment$score)
##
## -3 -2 -1 0 1 2 3
## 3 19 111 905 335 1404 7
mean(tweets_sentiment$score)
## [1] 1.079741
median(tweets_sentiment$score)
## [1] 2
#Display the sentiment Summary of Tweets Analysed
summary(tweets_sentiment$sentiment)
## Negative Neutral Positive
## 133 905 1746
#Plot the Sentiment summary of Tweets Analysed
ggplot(data = tweets_sentiment, aes(x = score, fill = sentiment)) +
  geom_bar() +
  labs(title = "Sentiment Score Bar Plot", x = "Sentiment Score", y = "Tweet Count") +
  scale_x_continuous(breaks = seq(-6,6,1)) +
  scale_y_continuous(breaks = seq(0,4000,500)) +
  scale_fill_manual(guide = guide_legend("Sentiment"), values = c("#DD0426","#246EB9","#04B430"))

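# As an optional extension (a sketch, not part of the original report), the date_trunc column
# created above can be used to plot the daily sentiment mix of the same tweets:
ggplot(tweets_sentiment, aes(x = date_trunc, fill = sentiment)) +
  geom_bar() +
  labs(title = "Daily Tweet Sentiment", x = "Date", y = "Tweet Count") +
  scale_fill_manual(guide = guide_legend("Sentiment"), values = c("#DD0426","#246EB9","#04B430"))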
#*******************************************************************************************
# Thank you
#*******************************************************************************************