R Markdown
#*******************************************************************************************
# Connect to Twitter and Download the tweets based on the hashtag
#*******************************************************************************************
# rm(list=ls(all=TRUE))
#
# setwd("G:\\Great Lakes\\Assignments\\WSMA Group1 Assignment")
# getwd()
#
# #install.packages("SchedulerR")
# library(twitteR)
# library(httr)
#
# #Check below URL
# # https://www.r-bloggers.com/extract-twitter-data-automatically-using-scheduler-r-package/
# api_key <- "xxxx"
# api_secret <- "xxxx"
# access_token <- "xxxx-xxxx"
# access_token_secret <- "xxxx"
# setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
#
#
# library(tm)
# tweets <- searchTwitter("#Jio", n=5000, lang="en")
# tweetsc.df <- twListToDF(tweets)
# dim(tweetsc.df)
# View(tweetsc.df)
#
# write.csv(tweetsc.df, file = "Jio_Tweet.csv",row.names=FALSE)
#*******************************************************************************************
# Load the required R libraries
#*******************************************************************************************
rm(list=ls())
library(RColorBrewer)
library(wordcloud)
library(SnowballC)
library(RCurl)
## Loading required package: bitops
library(ggplot2)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(twitteR)
library(ROAuth)
library(plyr)
##
## Attaching package: 'plyr'
## The following object is masked from 'package:twitteR':
##
## id
library(stringr)
library(base64enc)
# install.packages("topicmodels")
library(topicmodels)
library(data.table)
library(stringi)
library(devtools)
#install_github('okugami79/sentiment140')
library(sentiment)
## Loading required package: rjson
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
##
## Attaching package: 'qdapRegex'
## The following object is masked from 'package:ggplot2':
##
## %+%
## Loading required package: qdapTools
##
## Attaching package: 'qdapTools'
## The following object is masked from 'package:data.table':
##
## shift
## The following object is masked from 'package:plyr':
##
## id
## The following object is masked from 'package:twitteR':
##
## id
##
## Attaching package: 'qdap'
## The following object is masked from 'package:stringr':
##
## %>%
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:base':
##
## Filter
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdap':
##
## %>%
## The following object is masked from 'package:qdapTools':
##
## id
## The following object is masked from 'package:qdapRegex':
##
## explain
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:twitteR':
##
## id, location
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#*******************************************************************************************
# Read the Twitter data already downloaded
#*******************************************************************************************
#read the tweets file
tweets.df <- read.csv("Jio_Tweet.csv",stringsAsFactors = FALSE)
tweets.df$created <- as.Date(tweets.df$created, format= "%Y-%m-%d")
# View(tweets.df)
#*******************************************************************************************
# Process/clean the Twitter data
#*******************************************************************************************
# Remove character strings enclosed between < and >
tweets.df$text <- genX(tweets.df$text, " <", ">")
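# (If qdap's genX() is unavailable, a roughly equivalent base-R substitution could be used
# instead; the commented regex line below is an illustrative sketch, not the original approach.)
# tweets.df$text <- gsub(" <[^>]*>", "", tweets.df$text)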
# Create document corpus with tweet text
myCorpus<- Corpus(VectorSource(tweets.df$text))
# convert to Lowercase
myCorpus <- tm_map(myCorpus, content_transformer(stri_trans_tolower))
# Remove the links (URLs)
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
# Remove anything except English letters and spaces
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))
# Remove Single letter words
removeSingle <- function(x) gsub(" . ", " ", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeSingle))
# Remove Extra Whitespaces
myCorpus<- tm_map(myCorpus, stripWhitespace)
# keep a copy of "myCorpus" for stem completion later
myCorpusCopy<- myCorpus
# Stem words in the corpus
myCorpus<-tm_map(myCorpus, stemDocument)
writeLines(strwrap(myCorpus[[250]]$content,60))
## rt sanjaybafna relianc jio revis it happi new year plan now
## unlimit call and gb data per day just rs onward
# Function to correct/complete the text after stemming
stemCompletion2 <- function(x, dictionary) {
  x <- unlist(strsplit(as.character(x), " "))
  x <- x[x != ""]
  x <- stemCompletion(x, dictionary = dictionary)
  x <- paste(x, sep = "", collapse = " ")
  PlainTextDocument(stripWhitespace(x))
}
# Apply stem completion and display the same tweet as above with the completed and corrected text
myCorpus <- lapply(myCorpus, stemCompletion2, dictionary=myCorpusCopy)
myCorpus <- Corpus(VectorSource(myCorpus))
writeLines(strwrap(myCorpus[[250]]$content, 60))
## list(content = "rt sanjaybafna reliance jio revised it new
## year plan now unlimited call and gb data per day just rs
## onwards", meta = list(author = character(0), datetimestamp
## = list(sec = 39.3611810207367, min = 13, hour = 4, mday =
## 7, mon = 0, year = 118, wday = 0, yday = 6, isdst = 0),
## description = character(0), heading = character(0), id =
## character(0), language = character(0), origin =
## character(0)))
# Correcting misspelled words: first, a helper to count how often a given word occurs in the corpus
wordFreq <- function(corpus, word)
{
  results <- lapply(corpus,
                    function(x){ grep(as.character(x), pattern = paste0("\\<", word)) })
  sum(unlist(results))
}
n.call<- wordFreq(myCorpusCopy, "call")
n.plan <- wordFreq(myCorpusCopy, "plan")
cat(n.call, n.plan)
## 1399 1511
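# The same helper can be reused to check any other keyword of interest, e.g.
# (illustrative only, not part of the original run):
# wordFreq(myCorpusCopy, "data")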
# Helper to replace a word with its proper/corrected form
replaceWord <- function(corpus, oldword, newword)
{
  tm_map(corpus, content_transformer(gsub), pattern = oldword, replacement = newword)
}
myCorpus<- replaceWord(myCorpus, "languag", "language")
# Remove stopwords. The extra tokens below (e.g. "character" and the bare numbers) come from
# document metadata that was serialized into the text during stem completion.
myStopWords <- c(stopwords('english'), c("character","0","1","2","3","4","5","6","7","8","9","10","12","13","14","15","16","17","18","20","21","23","24","25","26","27","28","29","30","31","32","33","34","35","38","36","40","44","45","46","118","119"))
myCorpus <- tm_map(myCorpus, removeWords, myStopWords)
tdm<- TermDocumentMatrix(myCorpus, control= list(wordLengths= c(1, Inf)))
tdm
## <<TermDocumentMatrix (terms: 4885, documents: 2784)>>
## Non-/sparse entries: 94411/13505429
## Sparsity : 99%
## Maximal term length: 25
## Weighting : term frequency (tf)
idx <- which(dimnames(tdm)$Terms %in% c("call", "plan"))
as.matrix(tdm[idx,21:60])
## Docs
## Terms 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
## call 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1
## plan 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
## Docs
## Terms 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## call 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## plan 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
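# Given the 99% sparsity reported above, the term-document matrix could optionally be trimmed
# with tm's removeSparseTerms() before further analysis. This is a sketch of an alternative step,
# not applied in the original analysis; the 0.98 threshold is an assumption.
# tdm.dense <- removeSparseTerms(tdm, sparse = 0.98)  # keep terms present in at least ~2% of tweets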
#*******************************************************************************************
# word frequency, plot and Word Cloud
#*******************************************************************************************
# Find the terms used most frequently
(freq.terms <- findFreqTerms(tdm, lowfreq = 50))
## [1] "author" "call" "content" "data"
## [5] "datetimestamp" "day" "description" "gb"
## [9] "heading" "hour" "id" "isdst"
## [13] "jio" "just" "languagee" "list"
## [17] "mday" "meta" "min" "mon"
## [21] "new" "now" "onwards" "origin"
## [25] "per" "plan" "reliance" "revised"
## [29] "rs" "rt" "sanjaybafna" "sec"
## [33] "unlimited" "wday" "yday" "year"
## [37] "airtel" "network" "will" "19"
## [41] "announce" "offer" "reliancejio" "bangalore"
## [45] "boost" "enter" "groudon" "hatchminuted"
## [49] "left" "raid" "sponsored" "valor"
## [53] "weather" "41" "india" "47"
## [57] "details" "48" "get" "additional"
## [61] "enhanced" "provide" "jiocare" "speed"
## [65] "today" "mystic" "deal" "rcom"
## [69] "acquire" "assets" "communication" "mhz"
## [73] "spectrum" "chennai" "signal" "instinct"
## [77] "minutes" "official" "infocomm" "acquisition"
## [81] "agreement" "breaking" "definitive"
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq > 50)
df <- data.frame(term = names(term.freq), freq= term.freq)
# Plot the chart of frequent terms
ggplot(df, aes(reorder(term, freq), freq)) + theme_bw() + geom_bar(stat = "identity") + coord_flip() + labs(title = "Term Frequency Chart", x = "Terms", y = "Term Counts")

# Calculate word frequencies, sort them by frequency, and set up the word cloud
word.freq <-sort(rowSums(as.matrix(tdm)), decreasing= F)
pal<- brewer.pal(8, "Dark2")
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 2, random.order = F, colors = pal, max.words = 100)

#*******************************************************************************************
# find word correlations
#*******************************************************************************************
# Identify and plot word correlations, for example for the word "year"
WordCorr <- apply_as_df(myCorpus[1:500], word_cor, word = "year", r=.25)
## Warning in FUN(X[[i]], ...): The sd on the following words was 0:
## author, datetimestamp, description, heading, hour, id, isdst, languagee, list, mday, meta, min, mon, origin, sec, wday, yday
plot(WordCorr)

qheat(vect2df(WordCorr[[1]], "word", "cor"), values=TRUE, high="red",
digits=2, order.by ="cor", plot = FALSE) + coord_flip()
## Warning: Ignoring unknown aesthetics: fill

# Find associations with specific keywords in the tweets - say "call" and "plan"
findAssocs(tdm, "call", 0.2)
## $call
## unlimited just onwards revised gb per
## 0.95 0.94 0.94 0.94 0.90 0.90
## day year now new rs data
## 0.89 0.87 0.84 0.83 0.81 0.78
## plan sanjaybafna rt reliance
## 0.78 0.76 0.63 0.44
findAssocs(tdm, "plan", 0.2)
## $plan
## new revised onwards unlimited year
## 0.86 0.82 0.81 0.81 0.81
## data day gb just per
## 0.80 0.80 0.80 0.80 0.80
## rs call now sanjaybafna rt
## 0.80 0.78 0.76 0.67 0.63
## reliance price trollcinemaoff discount jiog
## 0.41 0.23 0.23 0.21 0.20
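# As a supplementary view (an illustrative sketch, not part of the original report), the
# association scores returned by findAssocs() can be reshaped into a data frame and plotted
# with ggplot2; the object names assoc.call and assoc.df below are hypothetical.
assoc.call <- findAssocs(tdm, "call", 0.2)$call
assoc.df <- data.frame(term = names(assoc.call), cor = as.numeric(assoc.call))
ggplot(assoc.df, aes(reorder(term, cor), cor)) + theme_bw() + geom_bar(stat = "identity") +
  coord_flip() + labs(title = "Terms Associated with 'call'", x = "Terms", y = "Correlation")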
#*******************************************************************************************
# Topic modelling to identify latent/hidden topics using the LDA technique
#*******************************************************************************************
dtm <- as.DocumentTermMatrix(tdm)
rowTotals <- apply(dtm , 1, sum)
NullDocs <- dtm[rowTotals==0, ]
dtm <- dtm[rowTotals> 0, ]
if (length(NullDocs$dimnames$Docs) > 0) {
  tweets.df <- tweets.df[-as.numeric(NullDocs$dimnames$Docs), ]
}
lda <- LDA(dtm, k = 5) # fit a 5-topic model
term <- terms(lda, 7) # first 7 terms of every topic
(term <- apply(term, MARGIN = 2, paste, collapse = ", "))
## Topic 1
## "list, meta, year, origin, hour, wday, mon"
## Topic 2
## "list, heading, mon, isdst, mday, languagee, rt"
## Topic 3
## "list, jio, id, year, min, origin, yday"
## Topic 4
## "datetimestamp, min, hour, year, content, heading, isdst"
## Topic 5
## "list, yday, id, mon, content, sec, mday"
topics<- topics(lda)
topics<- data.frame(date=(tweets.df$created), topic = topics)
qplot (date, ..count.., data=topics, geom ="density", fill= term[topic], position="stack")
## Warning: `position` is deprecated

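# The per-document topic probabilities behind the assignments above can also be inspected with
# topicmodels' posterior() (an optional sketch; the object name topic.prob is hypothetical):
topic.prob <- posterior(lda)$topics   # documents x 5 matrix of topic probabilities
round(head(topic.prob), 3)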
#*******************************************************************************************
# Sentiment Analysis
#*******************************************************************************************
# Sentiment analysis to identify positive/negative tweets
# Create the calculate.sentiment.score function to do this
# It takes each tweet's text plus custom positive and negative word lists (read from files) as inputs
calculate.sentiment.score <- function(sentences, positive.words, negative.words, .progress='none')
{
  require(plyr)
  require(stringr)
  # we got a vector of sentences. plyr will handle a list or a vector as an "l" for us
  # we want a simple array of scores back, so we use "l" + "a" + "ply" = laply:
  scores <- laply(sentences, function(sentence, positive.words, negative.words)
  {
    ## clean up sentences with R's regex-driven global substitute, gsub():
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    # remove retweets
    sentence <- gsub('(RT|via)((?:\\b\\W*@\\W+)+)', '', sentence)
    # remove at people
    sentence <- gsub('@\\w+', '', sentence)
    # remove punctuation
    sentence <- gsub('[[:punct:]]', '', sentence)
    # remove numbers
    sentence <- gsub('[[:digit:]]', '', sentence)
    # remove html links
    sentence <- gsub('http[s]?\\w+', '', sentence)
    # remove extra spaces
    sentence <- gsub('[ \t]{2,}', '', sentence)
    sentence <- gsub('^\\s+|\\s+$', '', sentence)
    # remove NA's
    sentence <- sentence[!is.na(sentence)]
    # convert to lower case:
    sentence <- tolower(sentence)
    # split into words. str_split is in the stringr package
    word.list <- str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words <- unlist(word.list)
    # compare our words to the dictionaries of positive & negative terms
    negative.matches <- match(words, negative.words)
    positive.matches <- match(words, positive.words)
    # match() returns the position of the matched term or NA
    # we just want a TRUE/FALSE:
    positive.matches <- !is.na(positive.matches)
    negative.matches <- !is.na(negative.matches)
    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score <- sum(positive.matches) - sum(negative.matches)
    return(score)
  }, positive.words, negative.words, .progress=.progress)
  scores.df <- data.frame(score=scores, text=sentences)
  return(scores.df)
}
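# A quick sanity check of the scorer on toy input (an illustrative example; these two sentences
# and the one-word dictionaries are hypothetical, not taken from the tweet data):
calculate.sentiment.score(c("jio plans are great", "the network is bad today"),
                          positive.words = c("great"),
                          negative.words = c("bad"))
# expected scores: 1 for the first sentence and -1 for the second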
# Read the positive and negative word lists from the custom files
positive <- scan("positive-words.txt", what= "character", comment.char= ";")
negative <- scan("negative-words.txt", what= "character", comment.char= ";")
tweets_text <- as.character(tweets.df$text)
#calculate the sentiment score for each tweet
tweets_sentiment <- calculate.sentiment.score(tweets_text, positive, negative, .progress="none")
# Append the tweet date to the data frame and add a truncated (date-only) column
tweets_sentiment$date <- tweets.df$created
tweets_sentiment$date_trunc <- as.Date(tweets_sentiment$date,format="%d-%m-%Y")
# View(tweets_sentiment)
# Tweets with score > 0 are positive, score < 0 are negative, and score == 0 are neutral
tweets_sentiment$sentiment[tweets_sentiment$score == 0] <- "Neutral"
tweets_sentiment$sentiment[tweets_sentiment$score < 0] <- "Negative"
tweets_sentiment$sentiment[tweets_sentiment$score > 0] <- "Positive"
tweets_sentiment$sentiment <- factor(tweets_sentiment$sentiment)
# Check the tweet counts for each sentiment score
table(tweets_sentiment$score)
##
## -3 -2 -1 0 1 2 3
## 3 19 111 905 335 1404 7
mean(tweets_sentiment$score)
## [1] 1.079741
median(tweets_sentiment$score)
## [1] 2
#Display the sentiment Summary of Tweets Analysed
summary(tweets_sentiment$sentiment)
## Negative Neutral Positive
## 133 905 1746
#Plot the Sentiment summary of Tweets Analysed
ggplot(data = tweets_sentiment, aes(x = score, fill = sentiment)) +
  geom_bar() +
  labs(title = "Sentiment Score Bar Plot", x = "Sentiment Score", y = "Tweet Count") +
  scale_x_continuous(breaks = seq(-6,6,1)) +
  scale_y_continuous(breaks = seq(0,4000,500)) +
  scale_fill_manual(guide = guide_legend("Sentiment"), values = c("#DD0426","#246EB9","#04B430"))

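# As an optional extension (a sketch, not part of the original report), the date_trunc column
# created above can be used to plot the daily sentiment mix of the same tweets:
ggplot(tweets_sentiment, aes(x = date_trunc, fill = sentiment)) +
  geom_bar() +
  labs(title = "Daily Tweet Sentiment", x = "Date", y = "Tweet Count") +
  scale_fill_manual(guide = guide_legend("Sentiment"), values = c("#DD0426","#246EB9","#04B430"))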
#*******************************************************************************************
# Thank you
#*******************************************************************************************