#Install Packages

# install.packages("tm")  # for text mining
# install.packages("wordcloud") # word-cloud generator 
# install.packages("RColorBrewer") # color palettes
# install.packages("readr")
# install.packages("plyr")
# install.packages("stringr")
# install.packages("stringi")
# install.packages("magrittr")
# install.packages("dplyr")
# install.packages("plotly")


##Load Require Library
library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
library(RColorBrewer)
library(wordcloud)
library(readr)
## Warning: package 'readr' was built under R version 3.4.4
library("plyr")
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Read the data
# NOTE(review): this path is machine-specific. An earlier note suggests the
# file may instead live at M:/S18/CS695/Midterm/Zynga.RDS, written with
# forward slashes rather than backslashes -- confirm before running.
tweetsDS <- readRDS(file = "C:\\Users\\Admin\\Downloads\\zynga.rds")
# Work on the tweet text. Per the original note, `text` is the first column
# of the zynga dataset (open the dataset in the viewer to see the columns).
tweets <- tweetsDS[["text"]]

# Function to clean tweets.
#
# Args:
#   x: character vector of raw tweet texts.
#
# Returns:
#   A character vector of the same length as `x`: lower-cased, with
#   non-ASCII characters, links, the retweet marker "rt", @mentions,
#   punctuation and digits removed, whitespace runs collapsed to a single
#   space, and leading/trailing whitespace trimmed.
clean.text <- function(x) {
  # Turn tabs/newlines into plain spaces first; otherwise the ASCII filter
  # below would delete them and glue the neighbouring words together.
  x <- gsub("[[:space:]]+", " ", x)
  # Remove everything outside printable ASCII (unicode, emoji, ...).
  x <- gsub("[^\x20-\x7E]", "", x)
  # Lower-case early so the "rt" and "http" patterns also catch "RT"/"HTTP".
  x <- tolower(x)
  # Remove links while their punctuation is still there to delimit them.
  x <- gsub("http\\S+", "", x)
  # Remove the retweet marker only as a whole word, so that words such as
  # "party" or "smart" are left intact.
  x <- gsub("\\brt\\b", "", x)
  # Remove @mentions.
  x <- gsub("@\\w+", "", x)
  # Remove punctuation.
  x <- gsub("[[:punct:]]", "", x)
  # Remove numbers.
  x <- gsub("[[:digit:]]", "", x)
  # Collapse the runs of spaces left by the removals into ONE space
  # (replacing them with "" would merge adjacent words).
  x <- gsub(" +", " ", x)
  # Remove blank spaces at the beginning and at the end.
  trimws(x)
}
# Normalise every tweet with clean.text() before any further analysis.
tweets <- clean.text(tweets)

# 1. Create a word cloud to show what users say about Zynga on Twitter

# Build a corpus with one document per cleaned tweet.
corpus <- Corpus(VectorSource(tweets))
# corpus = Corpus(VectorSource(cmail))

# Create the term-document matrix: keep terms of 3 to 20 characters and drop
# punctuation, numbers and English stop words.
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE
  )
)

# Convert to a plain matrix so rowSums() can be used on it.
tdm <- as.matrix(tdm)

# Get word counts in decreasing order.
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# Remove the top words which don't generate insights such as "the", "a",
# "and", etc. Here 1 is the 1st word in the list we want to remove.
# This must happen BEFORE the data frame below is built; otherwise the word
# cloud is still drawn from the unfiltered counts.
word_freqs <- word_freqs[-seq_len(9L)]

# Create a data frame with the remaining words and their frequencies.
# stringsAsFactors = FALSE keeps `word` a character vector on R < 4.0 too.
dm <- data.frame(
  word = names(word_freqs),
  freq = word_freqs,
  stringsAsFactors = FALSE
)

# Plot the 150 most frequent remaining words in a colored word cloud;
# needs the RColorBrewer package for the palette.
wordcloud(
  head(dm$word, 150),
  head(dm$freq, 150),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Check the top 20 most mentioned words.
head(word_freqs, 20)
##     looking         can         now      prized       adult       petra 
##         259         233         222         219         187         187 
##        play        game       trees      jeneva       found     rewards 
##         182         181         180         174         167         164 
##      points         bit       video sponsorship     needing       shook 
##         144         140         139         139         138         138 
##       gotas  rtherescar 
##         137         137

# 2. Create a histogram to show the distribution of Zynga users' sentiment

# Load the opinion lexicons as character vectors; scan() ignores everything
# after a ';' on a line (comment.char), which skips the files' header notes.
pos.words <- scan("positive-words.txt", what = "character", comment.char = ";")
neg.words <- scan("negative-words.txt", what = "character", comment.char = ";")

# Extend the negative lexicon with two extra slang terms.
neg.words <- append(neg.words, c("wtf", "fail"))

#Implementing our sentiment scoring algorithm
# Packages used by the sentiment scoring code below. library() is used
# instead of require() so that a missing package stops the script right here
# with a clear error, rather than returning FALSE and failing later with
# "could not find function".
library(plyr)
library(stringr)
library(stringi)
# Score sentences by lexicon lookup.
#
# Args:
#   sentences: vector of sentences (tweets) to score.
#   pos.words: character vector of positive terms.
#   neg.words: character vector of negative terms.
#   .progress: progress-bar name forwarded to plyr::laply().
#
# Returns:
#   A data frame with one row per sentence: `score` (number of positive-term
#   hits minus number of negative-term hits) and `text` (the input sentence).
score.sentiment <- function(sentences, pos.words, neg.words, .progress = "none") {
  # Scores a single sentence against both lexicons.
  score_one <- function(sentence, pos.words, neg.words) {
    # Strip punctuation, control characters and digit runs, in that order.
    for (pattern in c("[[:punct:]]", "[[:cntrl:]]", "\\d+")) {
      sentence <- gsub(pattern, "", sentence)
    }

    # Lower-case, split on whitespace (stringr::str_split) and flatten the
    # resulting list into a plain character vector of tokens.
    tokens <- unlist(str_split(tolower(sentence), "\\s+"))

    # %in% yields TRUE/FALSE per token; sum() counts the TRUEs, so this is
    # (number of positive hits) - (number of negative hits).
    sum(tokens %in% pos.words) - sum(tokens %in% neg.words)
  }

  # laply: list/vector in, array out -- one score per sentence.
  scores <- laply(sentences, score_one, pos.words, neg.words, .progress = .progress)

  data.frame(score = scores, text = sentences)
}

# Score every cleaned tweet against the positive/negative lexicons.
sentiment.scores <- score.sentiment(
  sentences = tweets,
  pos.words = pos.words,
  neg.words = neg.words,
  .progress = "none"
)

# Keep just the numeric scores for plotting.
score <- sentiment.scores[["score"]]

library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Histogram of the per-tweet sentiment scores. No data frame is passed to
# plot_ly(), so the formula ~score refers to the `score` vector created above.
p <- plot_ly(x = ~score, type = "histogram")
# Print the plot object so it is rendered.
p

# 3. Create a histogram to show Zynga users' involvement on each weekday

# Label each tweet with the English weekday name of its creation time.
# POSIXlt's `wday` field (0 = Sunday ... 6 = Saturday) is used instead of
# weekdays(), whose names depend on the session locale and would not match
# the hard-coded Freq.Monday ... Freq.Sunday columns on a non-English system.
weekday_levels <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
)
tweetsDS$days <-
  weekday_levels[(as.POSIXlt(tweetsDS$created)$wday + 6L) %% 7L + 1L]

# Count tweets per (isRetweet, weekday). Using a factor with all seven levels
# guarantees a (possibly zero) count for every weekday, so every Freq.<day>
# column referenced by the plot exists even if no tweet fell on that day.
dfrm <- as.data.frame(
  table(
    isRetweet = tweetsDS$isRetweet,
    days = factor(tweetsDS$days, levels = weekday_levels)
  ),
  responseName = "Freq"
)

# One row per isRetweet value, one Freq.<day> column per weekday.
tweetsDSDays <- reshape(
  dfrm,
  direction = "wide",
  timevar = "days",
  idvar = "isRetweet"
)

# Grouped bar chart: one bar per weekday, grouped by retweet status.
p <- plot_ly(tweetsDSDays, x = ~isRetweet, y = ~Freq.Monday, type = "bar", name = "Monday") %>%
  add_trace(y = ~Freq.Tuesday, name = "Tuesday") %>%
  add_trace(y = ~Freq.Wednesday, name = "Wednesday") %>%
  add_trace(y = ~Freq.Thursday, name = "Thursday") %>%
  add_trace(y = ~Freq.Friday, name = "Friday") %>%
  add_trace(y = ~Freq.Saturday, name = "Saturday") %>%
  add_trace(y = ~Freq.Sunday, name = "Sunday") %>%
  layout(yaxis = list(title = "Count"), barmode = "group")

p

# 4. Based on your results, provide recommendations for Zynga to increase its monthly active users
#
# 1. Based on the word cloud, the most-played game is Farmville, and most gamers are excited about prizes, points and rewards, so it may benefit the company to focus on these aspects when designing a game.
# 2. The sentiment analysis suggests that most people feel neutral when discussing Zynga, but there are about twice as many positive tweets as negative ones.
# 3. Finally, based on the weekday histogram, people are more likely to tweet about Zynga on Saturday and Sunday, and tweets are more likely to be retweeted on Friday, Saturday and Sunday; therefore it may be beneficial to release big games on these days.