This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# summary(zynga-midterm)
# install.packages("readr")
# install.packages("tm")
# install.packages("RColorBrewer")
# install.packages("wordcloud")
# install.packages("plyr")
# install.packages("stats")
# install.packages("stringr")
# install.packages("stringi")
# install.packages("dplyr")
# install.packages("plotly")
# install.packages("ggplot2")
# install.packages("graphics")
library('tm')
## Loading required package: NLP
library('RColorBrewer')
library('wordcloud')
library('readr')
# Load the saved tweet data set and pull out the tweet text column.
# NOTE(review): the path is an absolute, machine-specific location.
zyngaData <- readRDS(file = "M:/S18/CS695/Midterm/Zynga.RDS")
zyngaTweets <- zyngaData[["text"]]
#********************************************
# Clean tweets
#********************************************
#use this function to clean the tweets
# Normalise raw tweet text for word counting and sentiment scoring.
#
# x: character vector of tweets.
# Returns a character vector of the same length: lower-cased, with
# non-printable/non-ASCII characters, links, @mentions, the retweet marker
# "rt", punctuation and digits removed, and whitespace collapsed to single
# spaces with no leading or trailing blanks.
clean.text <- function(x) {
  # replace anything outside printable ASCII (this also covers tabs) by a space
  x <- gsub("[^\x20-\x7E]", " ", x)
  # lower-case first so that "RT" and "HTTP://..." are caught by the rules below
  x <- tolower(x)
  # remove links while they are still intact (before punctuation is stripped)
  x <- gsub("http\\S+", " ", x)
  # remove @mentions
  x <- gsub("@\\w+", " ", x)
  # remove the retweet marker only as a whole word, so "party" stays "party"
  x <- gsub("\\brt\\b", " ", x)
  # remove punctuation
  x <- gsub("[[:punct:]]", "", x)
  # remove numbers
  x <- gsub("[[:digit:]]", "", x)
  # collapse runs of whitespace to ONE space; replacing them with "" would
  # glue neighbouring words together
  x <- gsub("[[:space:]]+", " ", x)
  # strip leading and trailing blanks
  x <- trimws(x)
  x
}
# Normalise the raw tweet text before building the corpus.
zyngaTweets <- clean.text(zyngaTweets)

# Build a tm corpus with one document per cleaned tweet.
corpus <- Corpus(VectorSource(zyngaTweets))
# corpus = Corpus(VectorSource(cmail))

# Term-document matrix: keep terms of 3 to 50 characters and drop
# punctuation, numbers and English stop words.
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 50),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE
  )
)

# Dense matrix so that row sums give the total count per term.
tdm <- as.matrix(tdm)

# Term counts, most frequent first.
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# Data frame of terms and their frequencies (used for the word cloud).
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Drop the nine most frequent terms, which do not generate insights.
# NOTE(review): dm was built before this filter, so the word cloud below
# still contains these nine terms; only word_freqs is shortened.
word_freqs <- word_freqs[-seq_len(9)]

# Plot the 200 most frequent terms as a coloured word cloud
# (the palette comes from the RColorBrewer package).
wordcloud(
  head(dm$word, 200),
  head(dm$freq, 200),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

# Show the 20 most mentioned of the remaining words.
head(word_freqs, 20)
## looking can prized now adult petra
## 258 232 219 196 187 187
## game jeneva found trees points car
## 184 174 167 166 142 142
## rthere bit video sponsorship needing shook
## 140 140 139 139 138 138
## rewards gotas
## 137 137
# Sentiment analysis: load the positive and negative word lists.
# Lines starting with ';' in the lexicon files are treated as comments.
pos.words <- scan("positive-words.txt", what = "character", comment.char = ";")
neg.words <- scan("negative-words.txt", what = "character", comment.char = ";")
# Extend the negative list with two extra terms.
neg.words <- append(neg.words, c("wtf", "fail"))
#Implementing our sentiment scoring algorithm
require(plyr)
## Loading required package: plyr
require(stringr)
## Loading required package: stringr
require(stringi)
## Loading required package: stringi
# Score each sentence by lexicon lookup:
#   score = (number of words found in pos.words)
#         - (number of words found in neg.words)
#
# sentences: character vector of texts to score.
# pos.words, neg.words: character vectors of lexicon terms. Words are
#   lower-cased before lookup, so the terms are expected in lower case.
# .progress: passed through to laply() (plyr), e.g. 'none' or 'text'.
# Returns a data.frame with one row per sentence and the columns
#   `score` (the computed score) and `text` (the sentence as passed in).
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
  # scorer for a single sentence
  score.one = function(sentence, pos.words, neg.words) {
    # strip punctuation, control characters and digits
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub('\\d+', '', sentence)
    # lower-case so that "LOVE" and "Love" match the lexicon entry "love"
    sentence = tolower(sentence)
    # split on whitespace (str_split is in the stringr package) and flatten
    # the one-element list it returns
    words = unlist(str_split(sentence, '\\s+'))
    # %in% gives TRUE/FALSE per word; sum() counts the TRUEs
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }
  # we got a vector of sentences and want a simple array of scores back,
  # so we use plyr's "l" + "a" + "ply" = "laply"
  scores = laply(sentences, score.one, pos.words, neg.words, .progress=.progress)
  scores.df = data.frame(score=scores, text=sentences)
  return(scores.df)
}
# Score every cleaned tweet and keep the numeric scores for plotting.
sentiment.scores <- score.sentiment(
  zyngaTweets,
  pos.words,
  neg.words,
  .progress = "none"
)
score <- sentiment.scores$score

library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout

# Histogram of the sentiment scores.
p <- plot_ly(x = ~score, type = "histogram")
p

# Weekday name on which each tweet was created.
zyngaData$days <- weekdays(as.POSIXlt(zyngaData$created))

# Count tweets per (isRetweet, weekday) pair ...
dfrm <- data.frame(table(zyngaData[, c("isRetweet", "days")]))
# ... and spread the weekdays into one Freq.<day> column each.
retweetDays <- reshape(
  dfrm,
  direction = "wide",
  timevar = "days",
  idvar = "isRetweet"
)

# Grouped bar chart: one bar per weekday for each isRetweet value.
# NOTE(review): the Freq.<day> names assume weekdays() returns English day
# names and that every weekday occurs in the data - confirm.
p <- plot_ly(
  retweetDays,
  x = ~isRetweet,
  y = ~Freq.Monday,
  type = "bar",
  name = "Monday"
) %>%
  add_trace(y = ~Freq.Tuesday, name = "Tuesday") %>%
  add_trace(y = ~Freq.Wednesday, name = "Wednesday") %>%
  add_trace(y = ~Freq.Thursday, name = "Thursday") %>%
  add_trace(y = ~Freq.Friday, name = "Friday") %>%
  add_trace(y = ~Freq.Saturday, name = "Saturday") %>%
  add_trace(y = ~Freq.Sunday, name = "Sunday") %>%
  layout(yaxis = list(title = "Count"), barmode = "group")
p
As stated in the case study, social games have an average churn rate of 50% per month, meaning that half of the new players who sign up will be gone within a month. To retain registered players, Zynga needs to increase communication among players appropriately, which will help increase revenue and reduce churn while adding to the viral popularity of Zynga games worldwide. Beyond this, based on the word cloud and the sentiment analysis, I notice that users are more active on weekends than on weekdays. Also, many users are registering to win prizes. According to this analysis, Zynga could gain more active users by giving out more prizes/points for playing games and by awarding bonus points for active play on weekends.