This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

(A) Create a word cloud to show what users are saying about Zynga on Twitter.

# summary(zynga-midterm)

# install.packages("readr")
# install.packages("tm")
# install.packages("RColorBrewer")
# install.packages("wordcloud")
# install.packages("plyr")
# install.packages("stats")
# install.packages("stringr")
# install.packages("stringi")
# install.packages("dplyr")
# install.packages("plotly")
# install.packages("ggplot2")
# install.packages("graphics")

library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')
library('readr')

# Read the serialized Zynga tweet data and keep the tweet text for the
# text-mining steps that follow.
zyngaData <- readRDS(file = "M:/S18/CS695/Midterm/Zynga.RDS")
zyngaTweets <- zyngaData[["text"]]

#********************************************
#         Clean tweets
#********************************************
#use this function to clean the tweets
# Normalise raw tweet text for word-frequency and sentiment analysis.
#
# Args:
#   x: character vector of tweets.
#
# Returns:
#   Character vector of the same length as `x`: lower-case, printable ASCII
#   only, with links, retweet markers, @mentions, punctuation and digits
#   removed, and whitespace collapsed to single spaces with no leading or
#   trailing blanks.
clean.text <- function(x) {
  # replace everything outside printable ASCII (this includes tabs and
  # newlines) with a space so neighbouring words stay separated
  x <- gsub("[^\x20-\x7E]", " ", x)
  # remove links first, while "://", "." and "/" still hold the URL together
  x <- gsub("http\\S+", " ", x)
  # remove the retweet marker only as a whole word, in either case; a bare
  # "rt" pattern would also eat the middle of words such as "party"
  x <- gsub("\\brt\\b", "", x, ignore.case = TRUE)
  # remove @mentions
  x <- gsub("@\\w+", "", x)
  # remove punctuation
  x <- gsub("[[:punct:]]", "", x)
  # remove numbers
  x <- gsub("[[:digit:]]", "", x)
  # collapse runs of blanks to ONE space; replacing them with "" would glue
  # adjacent words together (e.g. "RT  there" -> "rthere")
  x <- gsub("[ \t]+", " ", x)
  # strip leading and trailing blanks
  x <- trimws(x)
  tolower(x)
}

# clean tweets
zyngaTweets <- clean.text(zyngaTweets)

# creating wordcloud

# wrap the cleaned tweets in a tm corpus (each vector element is a document)
corpus <- Corpus(VectorSource(zyngaTweets))
# corpus = Corpus(VectorSource(cmail))

# create term-document matrix: keep terms of 3 to 50 characters and drop
# punctuation, numbers and English stop words
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 50),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE
  )
)

# convert to a plain matrix (rows = terms, columns = documents)
tdm <- as.matrix(tdm)

# get word counts in decreasing order
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# remove the 9 most frequent words, which don't generate insights.
# This must happen BEFORE the data frame is built: the word cloud is drawn
# from `dm`, so trimming `word_freqs` afterwards would leave those words in
# the plot.
word_freqs <- word_freqs[-seq_len(9)]

# create a data frame with words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# plot the 200 most frequent remaining words as a colored word cloud;
# the palette comes from the RColorBrewer package
wordcloud(
  head(dm$word, 200),
  head(dm$freq, 200),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# check the top 20 most mentioned words
head(word_freqs, 20)
##     looking         can      prized         now       adult       petra 
##         258         232         219         196         187         187 
##        game      jeneva       found       trees      points         car 
##         184         174         167         166         142         142 
##      rthere         bit       video sponsorship     needing       shook 
##         140         140         139         139         138         138 
##     rewards       gotas 
##         137         137

(B) Create a histogram to show the distribution of Zynga users’ sentiment.

# Sentiment Analysis

# Read the positive and negative opinion word lists; scan() ignores
# everything from a ';' to the end of the line.
pos.words <- scan(
  file = "positive-words.txt",
  what = "character",
  comment.char = ";"
)
neg.words <- scan(
  file = "negative-words.txt",
  what = "character",
  comment.char = ";"
)

# extend the negative list with two extra terms
neg.words <- append(neg.words, c("wtf", "fail"))

#Implementing our sentiment scoring algorithm
require(plyr)
## Loading required package: plyr
require(stringr)
## Loading required package: stringr
require(stringi)
## Loading required package: stringi
# Score each sentence by counting opinion-lexicon hits.
#
# Args:
#   sentences: character vector of texts to score.
#   pos.words: character vector of positive terms.
#   neg.words: character vector of negative terms.
#   .progress: progress-bar setting, passed through to plyr::laply().
#
# Returns:
#   data.frame with one row per sentence and two columns: `score` (number of
#   positive-term matches minus number of negative-term matches) and `text`
#   (the sentence as it was passed in).
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  # Fold the lexicons to lower case once, outside the per-sentence loop, so
  # that matching is case-insensitive and does not rely on the caller having
  # lower-cased the text beforehand.
  pos.words <- tolower(pos.words)
  neg.words <- tolower(neg.words)

  score_one <- function(sentence) {
    # strip punctuation, control characters and digits
    sentence <- gsub('[[:punct:]]', '', sentence)
    sentence <- gsub('[[:cntrl:]]', '', sentence)
    sentence <- gsub('\\d+', '', sentence)
    sentence <- tolower(sentence)

    # split on whitespace (str_split is in the stringr package) and flatten
    # the resulting one-element list
    words <- unlist(str_split(sentence, '\\s+'))
    # leading/trailing whitespace produces empty tokens; drop them
    words <- words[nzchar(words)]

    # %in% gives TRUE/FALSE per word; sum() counts the TRUEs
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  # plyr: "l" (list/vector in) + "a" (array out) + "ply" = laply
  scores <- laply(sentences, score_one, .progress = .progress)

  scores.df <- data.frame(score = scores, text = sentences)
  return(scores.df)
}

# Score every tweet against the two lexicons, then keep just the numeric
# scores for plotting.
sentiment.scores <- score.sentiment(
  sentences = zyngaTweets,
  pos.words = pos.words,
  neg.words = neg.words,
  .progress = 'none'
)

score <- sentiment.scores[["score"]]

library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Histogram of the per-tweet sentiment scores; evaluating `p` at top level
# displays the plot.
p <- plot_ly(type = "histogram", x = ~score)
p

(C) Create a histogram to show Zynga users’ involvement on each weekday

# Count tweets per weekday, split by retweet status, and plot grouped bars.

# Derive the weekday from POSIXlt's numeric `wday` field (0 = Sunday ...
# 6 = Saturday) instead of weekdays(), whose names follow the session locale
# and would not match the English Freq.<Day> columns used below.
week_order <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                "Saturday", "Sunday")
sunday_first <- c("Sunday", week_order[1:6])
created_lt <- as.POSIXlt(zyngaData$created)

# A factor carrying all seven levels makes table() report a zero count for a
# weekday without tweets, so every Freq.<Day> column exists after reshape().
zyngaData$days <- factor(sunday_first[created_lt$wday + 1L],
                         levels = week_order)

# Get subset of the data to explore: long table of counts with columns
# isRetweet, days and Freq
dfrm <- data.frame(table(zyngaData[, c("isRetweet", "days")]))

# one row per isRetweet value, one Freq.<Day> column per weekday
retweetDays <- reshape(dfrm, direction = "wide",
                       timevar = "days", idvar = "isRetweet")

p <- plot_ly(retweetDays, x = ~isRetweet,
             y = ~Freq.Monday, type = 'bar', name = 'Monday') %>%
  add_trace(y = ~Freq.Tuesday, name = 'Tuesday') %>%
  add_trace(y = ~Freq.Wednesday, name = 'Wednesday') %>%
  add_trace(y = ~Freq.Thursday, name = 'Thursday') %>%
  add_trace(y = ~Freq.Friday, name = 'Friday') %>%
  add_trace(y = ~Freq.Saturday, name = 'Saturday') %>%
  add_trace(y = ~Freq.Sunday, name = 'Sunday') %>%
  layout(yaxis = list(title = 'Count'), barmode = 'group')

p

(D) Based on your results, provide recommendations for Zynga to increase its monthly active users.

As stated in the case study, social games have an average churn rate of 50% per month, meaning that half of the new players who sign up will be gone within a month. To keep players registered, Zynga needs to increase communication between players appropriately, which will help increase revenue, reduce churn, and add to the viral popularity of Zynga games worldwide. Beyond this, according to the word cloud and the sentiment analysis, I notice that users are more active on weekends than on weekdays. Also, many users are registering in order to win prizes. So, according to the analysis, Zynga can gain more active users if it gives out more prizes/points for playing its games and awards bonus points for playing actively on weekends.