This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Install necessary packages

# install.packages('tm')
# install.packages('RColorBrewer')
# install.packages('wordcloud')
# install.packages('stringr')
# install.packages('stringi')
# install.packages('ggplot2')
# install.packages('plyr')
# install.packages('dplyr')
# install.packages('plotly')
# install.packages('reshape')
# install.packages('plotrix')


library('tm')

## Warning: package 'tm' was built under R version 3.4.4

## Loading required package: NLP

library('RColorBrewer')
library('wordcloud')

## Warning: package 'wordcloud' was built under R version 3.4.4

library('stringr')
library('ggplot2')

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library('plyr')
library('dplyr')

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library('stringi')
library('plotly')

## Warning: package 'plotly' was built under R version 3.4.4

## 
## Attaching package: 'plotly'

## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

Process data

# Load the stored Zynga data set and keep its `text` element (the tweet bodies).
zynga <- readRDS(file = "C:/Users/Mounika/Zynga.RDS")
zyngaTweets <- zynga$text

#********************************************
#         Clean tweets
#********************************************
# Clean raw tweet text before tokenising it.
#
# Removes links, the retweet marker, @mentions, punctuation, digits and any
# character outside printable ASCII, then collapses whitespace runs to a single
# space and trims both ends.
#
# x: character vector of tweets.
# Returns a character vector of the same length as `x`.
clean.text <- function(x) {
  # Remove links first, while "://" and "." still delimit them.
  x <- gsub("http\\S+", "", x)
  # Remove the retweet marker only as a whole word (either case); a bare
  # "rt" pattern would also eat the middle of words such as "party" or "smart".
  x <- gsub("\\brt\\b", "", x, ignore.case = TRUE)
  # Remove @mentions.
  x <- gsub("@\\w+", "", x)
  # Remove punctuation.
  x <- gsub("[[:punct:]]", "", x)
  # Remove digits.
  x <- gsub("[[:digit:]]", "", x)
  # Replace everything outside printable ASCII with a space.
  x <- gsub("[^\x20-\x7E]", " ", x)
  # Collapse whitespace runs to ONE space so neighbouring words stay separate.
  x <- gsub("[ \t]+", " ", x)
  # Trim last, so spaces introduced by the steps above are removed as well.
  trimws(x)
}

Create word cloud

# Normalise the raw tweet text before building the corpus.
zyngaTweets <- clean.text(zyngaTweets)

# Create a word cloud of the tweets of Zynga users.

corpus <- Corpus(VectorSource(zyngaTweets))
# corpus = Corpus(VectorSource(cmail))

# Create the term-document matrix: keep terms of 3-20 characters and drop
# punctuation, numbers and English stop words.
tdm <- TermDocumentMatrix(
  corpus,
  control = list(
    wordLengths = c(3, 20),
    removePunctuation = TRUE,
    stopwords = c("the", "a", stopwords("english")),
    removeNumbers = TRUE
  )
)

# Convert to a plain matrix (terms in rows, documents in columns).
tdm <- as.matrix(tdm)

# Get word counts in decreasing order.
word_freqs <- sort(rowSums(tdm), decreasing = TRUE)

# Remove the nine most frequent terms, which dominate the plot without adding
# insight. This must happen BEFORE the data frame is built; trimming the vector
# afterwards never reaches the word cloud. Skipped when there are too few terms
# to leave anything to draw.
if (length(word_freqs) > 9L) {
  word_freqs <- word_freqs[-seq_len(9)]
}

# Create a data frame with the remaining words and their frequencies.
dm <- data.frame(word = names(word_freqs), freq = word_freqs)

# Plot the 100 most frequent remaining words as a coloured word cloud
# (palette from the RColorBrewer package).
wordcloud(
  head(dm$word, 100),
  head(dm$freq, 100),
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2")
)

Word database (positive and negative words), R script:

# Load the positive and negative lexicon files. Each is read as a single
# unnamed column (V1); any embedded newlines are split into separate entries.
posText <- read.delim("C:/Users/Mounika/pos.words.txt",
  header = FALSE, stringsAsFactors = FALSE)
posText <- unlist(strsplit(posText$V1, "\n"))
negText <- read.delim("C:/Users/Mounika/neg.words.txt",
  header = FALSE, stringsAsFactors = FALSE)
negText <- unlist(strsplit(negText$V1, "\n"))

# Extend the lexicons with extra terms. Everything is lower-cased because
# score.sentiment() lower-cases each tweet before matching, so capitalised
# entries such as 'Grt', 'Fight' or 'Tnot' could otherwise never match.
pos.words <- tolower(c(posText, 'congrats', 'prizes', 'prize', 'thanks', 'thnx',
  'Grt', 'gr8', 'plz', 'trending', 'recovering', 'brainstrom', 'leader'))
neg.words <- tolower(c(negText, 'Fight', 'fighting', 'waiting', 'epicfail',
  'mechanical', 'wtf', 'arrest', 'no', 'Tnot'))

Lexical analysis (calculates the positive score, negative score and overall score), R script:

# Score sentences against a positive and a negative word list.
#
# sentences: character vector of sentences (tweets) to score.
# pos.words: character vector of positive words.
# neg.words: character vector of negative words.
# .progress: accepted for backward compatibility; not used.
#
# Returns an unnamed list of three data frames, in this order: overall score
# (positive hits minus negative hits), positive hit count, negative hit count.
# Each frame also carries a `text` column holding `sentences`. The per-sentence
# values are handed to data.frame() as a list, so every sentence's value lands
# in its own column (repeated down the rows) rather than in one column; the
# per-sentence vector is therefore read from the first row. That layout is kept
# deliberately, because existing callers rely on it.
score.sentiment <- function(sentences, pos.words, neg.words, .progress = 'none') {
  # Sentences are lower-cased below, so the lexicons must be lower-cased too;
  # otherwise capitalised lexicon entries can never match.
  pos.words <- tolower(pos.words)
  neg.words <- tolower(neg.words)

  # Returns c(score, positive hits, negative hits) for one sentence.
  tally_sentence <- function(sentence) {
    sentence <- gsub('[[:punct:]]', ' ', sentence)
    # Control characters (tabs, newlines, ...) separate words, so turn them
    # into spaces instead of deleting them and fusing the neighbouring words.
    sentence <- gsub('[[:cntrl:]]', ' ', sentence)
    sentence <- gsub('\\d+', '', sentence) # removes digits
    words <- unlist(strsplit(tolower(sentence), '\\s+'))
    # Drop missing and empty tokens so they can never count as a hit.
    words <- words[!is.na(words) & nzchar(words)]
    pos_hits <- sum(words %in% pos.words)
    neg_hits <- sum(words %in% neg.words)
    c(pos_hits - neg_hits, pos_hits, neg_hits)
  }

  tallies <- lapply(sentences, tally_sentence)
  # i-th component of every tally, kept as a list (see the layout note above).
  pick <- function(i) lapply(tallies, '[[', i)

  scores.df <- data.frame(score = pick(1), text = sentences)
  positive.df <- data.frame(Positive = pick(2), text = sentences)
  negative.df <- data.frame(Negative = pick(3), text = sentences)
  list(scores.df, positive.df, negative.df)
}

Functional R-script for Lexical Analysis:

# Score every cleaned tweet against the positive and negative word lists.
result <- score.sentiment(zyngaTweets, pos.words, neg.words)
library(reshape)

## Warning: package 'reshape' was built under R version 3.4.4

## 
## Attaching package: 'reshape'

## The following object is masked from 'package:plotly':
## 
##     rename

## The following object is masked from 'package:dplyr':
## 
##     rename

## The following objects are masked from 'package:plyr':
## 
##     rename, round_any

# Assemble one row per tweet from the three data frames in `result`.
#
# Each frame holds a `text` column plus one value column per tweet, with the
# same values repeated down every row, so the per-tweet vector is simply the
# first row of the value columns. Reading it directly avoids the melt() round
# trip (which relied on partial argument matching) and the throwaway copies.
first_row_values <- function(df) {
  # drop = FALSE keeps a data frame even when there is only one tweet.
  value_cols <- df[1, names(df) != "text", drop = FALSE]
  unlist(value_cols, use.names = FALSE)
}

# Merge the three measures into the final table.
table_final <- data.frame(
  Text = result[[1]]$text,
  Score = first_row_values(result[[1]]),
  Positive = first_row_values(result[[2]]),
  Negative = first_row_values(result[[3]])
)
table_final

Positive and negative percentage (R script):

# Add the positive and negative share of matched words for every tweet.
# Values are fractions (hits / total hits), not multiplied by 100.
sentiment_share <- function(hits, total) {
  share <- hits / total
  # 0 / 0 yields NaN when a tweet matched no lexicon word; report 0 instead.
  share[is.nan(share)] <- 0
  share
}

matched_total <- table_final$Positive + table_final$Negative
table_final$PosPercent <- sentiment_share(table_final$Positive, matched_total)
table_final$NegPercent <- sentiment_share(table_final$Negative, matched_total)

Histograms of Zynga users' sentiment (R script):

# Histograms of the per-tweet positive and negative word counts.
hist(x = table_final$Positive, col = rainbow(n = 10))

hist(x = table_final$Negative, col = rainbow(n = 10))

Pie chart of Zynga users' sentiment (R script):

# Pie chart comparing the total positive and negative word matches over all tweets.
library(plotrix)
labels <- c("positive", "Negative")
slices <- c(sum(table_final$Positive), sum(table_final$Negative))
pie(
  x = slices,
  labels = labels,
  col = rainbow(n = length(labels)),
  main = "Sentiment Analysis"
)

Midterm Exam