#Install Packages
# install.packages("tm") # for text mining
# install.packages("wordcloud") # word-cloud generator
# install.packages("RColorBrewer") # color palettes
# install.packages("readr")
# install.packages("plyr")
# install.packages("stringr")
# install.packages("stringi")
# install.packages("magrittr")
# install.packages("dplyr")
# install.packages("plotly")
## Load required libraries
library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
library(RColorBrewer)
library(wordcloud)
library(readr)
## Warning: package 'readr' was built under R version 3.4.4
library("plyr")
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#Read the Data
# Load the saved R object stored in zynga.rds.
# NOTE(review): this is an absolute, machine-specific Windows path; adjust it to wherever zynga.rds lives on your machine.
tweetsDS <- readRDS("C:\\Users\\Admin\\Downloads\\zynga.rds")
## Reviewer note: the path should be M:/S18/CS695/Midterm/Zynga.RDS, written with forward slashes rather than backslashes.
# Keep only the `text` column, which is what the rest of the script works on.
# The original note says it is the first column of the data set; open the Zynga data set in the viewer to confirm.
tweets <- tweetsDS$text
# Function to clean tweets
# Normalise raw tweet text for word-level analysis.
#
# Args:
#   x: character vector of tweets.
#
# Returns:
#   A character vector of the same length as `x`, lower-cased, with links,
#   the retweet marker, @mentions, punctuation, digits and non-ASCII
#   characters removed, and whitespace collapsed to single spaces.
clean.text <- function(x)
{
  # Turn tabs/newlines into plain spaces first: the non-ASCII strip below
  # would otherwise delete them and glue the neighbouring words together.
  x <- gsub("[[:space:]]+", " ", x)
  # remove unicode (anything outside printable ASCII)
  x <- gsub("[^\x20-\x7E]", "", x)
  # remove links while their punctuation is still intact
  x <- gsub("http\\S+", "", x)
  # remove the retweet marker only as a standalone word, in any case;
  # a bare "rt" pattern would also eat the "rt" inside words like "party"
  x <- gsub("\\brt\\b", "", x, ignore.case = TRUE)
  # remove @mentions
  x <- gsub("@\\w+", "", x)
  # remove punctuation
  x <- gsub("[[:punct:]]", "", x)
  # remove numbers
  x <- gsub("[[:digit:]]", "", x)
  # collapse runs of blanks left behind by the removals into one space
  # (replacing them with "" would merge adjacent words)
  x <- gsub("[ \t]{2,}", " ", x)
  # remove all blank spaces at the beginning and at the end
  x <- gsub("^ +| +$", "", x)
  # tolower
  tolower(x)
}
# Run every tweet through clean.text() and keep the cleaned text
tweets <- clean.text(tweets)
# 2. Create a histogram to show the distribution of Zynga users' sentiment
# Read the positive and negative word lists as character vectors;
# anything after a ';' on a line of those files is skipped as a comment.
pos.words <- scan(file = "positive-words.txt", what = "character", comment.char = ";")
neg.words <- scan(file = "negative-words.txt", what = "character", comment.char = ";")
# Add two extra terms to the end of the negative list
neg.words <- append(neg.words, c("wtf", "fail"))
#Implementing our sentiment scoring algorithm
require(plyr)
require(stringr)
## Loading required package: stringr
require(stringi)
## Loading required package: stringi
# Score each sentence as (number of positive words) - (number of negative words).
#
# Args:
#   sentences: vector of texts to score.
#   pos.words: vector of positive terms.
#   neg.words: vector of negative terms.
#   .progress: passed through to plyr::laply() as its .progress argument.
#
# Returns:
#   A data.frame with one row per sentence: `score` (the sentiment score)
#   and `text` (the sentence as supplied).
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
  # Scores a single sentence against the two dictionaries.
  score.one = function(txt, pos.words, neg.words) {
    # strip punctuation, control characters and digits
    txt = gsub('[[:punct:]]', '', txt)
    txt = gsub('[[:cntrl:]]', '', txt)
    txt = gsub('\\d+', '', txt)
    # lower-case, split on whitespace (stringr) and flatten the resulting list
    tokens = unlist(str_split(tolower(txt), '\\s+'))
    # %in% yields TRUE/FALSE per token; sum() counts the TRUEs
    n.pos = sum(tokens %in% pos.words)
    n.neg = sum(tokens %in% neg.words)
    n.pos - n.neg
  }
  # laply: list/vector in, array out -- one score per sentence
  scores = laply(sentences, score.one, pos.words, neg.words, .progress=.progress)
  data.frame(score=scores, text=sentences)
}
# Score every cleaned tweet against the two word lists
sentiment.scores <- score.sentiment(
  sentences = tweets,
  pos.words = pos.words,
  neg.words = neg.words,
  .progress = "none"
)
# Keep just the score column for plotting
score <- sentiment.scores[["score"]]
library(plotly)
# Histogram of the sentiment scores
p <- plot_ly(x = ~score, type = "histogram")
p
# 3. Create a histogram to show Zynga users' involvement on each weekday
# Grouped bar chart of tweet counts per weekday, split by retweet status.
#
# English weekday names indexed by POSIXlt$wday (0 = Sunday ... 6 = Saturday).
# weekdays() returns names in the current locale, which would not match the
# hard-coded Freq.<Day> columns used below on a non-English system.
weekday.levels <- c("Sunday", "Monday", "Tuesday", "Wednesday",
                    "Thursday", "Friday", "Saturday")
tweetsDS$days <- weekday.levels[as.POSIXlt(tweetsDS$created)$wday + 1L]
# Cross-tabulate with all seven days as factor levels so that a weekday with
# no tweets still gets a zero count, and therefore a Freq.<Day> column.
dfrm <- data.frame(table(
  isRetweet = tweetsDS$isRetweet,
  days = factor(tweetsDS$days, levels = weekday.levels)
))
# One row per isRetweet value, one Freq.<Day> column per weekday
tweetsDSDays <- reshape(dfrm, direction = "wide", timevar = "days", idvar = "isRetweet")
p <- plot_ly(tweetsDSDays, x = ~isRetweet, y = ~Freq.Monday, type = 'bar', name = 'Monday') %>%
  add_trace(y = ~Freq.Tuesday, name = 'Tuesday') %>%
  add_trace(y = ~Freq.Wednesday, name = 'Wednesday') %>%
  add_trace(y = ~Freq.Thursday, name = 'Thursday') %>%
  add_trace(y = ~Freq.Friday, name = 'Friday') %>%
  add_trace(y = ~Freq.Saturday, name = 'Saturday') %>%
  add_trace(y = ~Freq.Sunday, name = 'Sunday') %>%
  layout(yaxis = list(title = 'Count'), barmode = 'group')
p
# 4. Based on your results, provide recommendations for Zynga to increase its monthly active users.
#    1. Based on the word cloud, the most-played game is FarmVille, and most gamers are excited about prizes, points and rewards, so it may benefit the company to focus on these aspects when designing a game.
#    2. The sentiment analysis suggests that most people feel neutral when discussing Zynga, but there are twice as many positive tweets as negative ones.
#    3. Finally, based on the histogram, people are more likely to tweet about Zynga on Saturday and Sunday, and tweets are more likely to be retweeted on Friday, Saturday and Sunday; therefore it may be beneficial to release big games on these days.