1. Installing and loading packages

install.packages('RTextTools')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'RTextTools' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('e1071')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'e1071' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('readr')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'readr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('tidyverse')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('text2vec')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'text2vec' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('ggrepel')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'ggrepel' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('purrrlyr')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'purrrlyr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('caret')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'caret' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('glmnet')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'glmnet' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('ggplot2')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
library(purrrlyr)
## Warning: package 'purrrlyr' was built under R version 3.4.4
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.4
## -- Attaching packages -------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.8.0     v stringr 1.3.0
## v readr   1.1.1     v forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(text2vec)
## Warning: package 'text2vec' was built under R version 3.4.4
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.4
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.4
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded glmnet 2.0-16
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 3.4.4
library(ggplot2)

2. Importing traning and testing data

tweets_classified <- readRDS("C:/Users/Mounika/tweets_classified.RDS")

# data splitting on train and test by 0.8 / 0.2
set.seed(2340)
trainIndex <- createDataPartition(tweets_classified$sentiment, p = 0.8, 
                                  list = FALSE, 
                                  times = 1)
tweets_train <- tweets_classified[trainIndex, ]
tweets_test <- tweets_classified[-trainIndex, ]

3. doc to vec

# define preprocessing function and tokenization function
prep_fun <- tolower
tok_fun <- word_tokenizer

it_train <- itoken(tweets_train$text, 
                   preprocessor = prep_fun, 
                   tokenizer = tok_fun,
                   ids = tweets_train$id,
                   progressbar = TRUE)
it_test <- itoken(tweets_test$text, 
                  preprocessor = prep_fun, 
                  tokenizer = tok_fun,
                  ids = tweets_test$id,
                  progressbar = TRUE)

creating vocabulary and document-term matrix

# creating vocabulary and document-term matrix
vocab <- create_vocabulary(it_train)
## 
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |=================================================================| 100%
vectorizer <- vocab_vectorizer(vocab)
dtm_train <- create_dtm(it_train, vectorizer)
## 
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |=================================================================| 100%
# define tf-idf model
tfidf <- TfIdf$new()
# fit the model to the train data and transform it with the fitted model
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)
# apply pre-trained tf-idf transformation to test data
dtm_test_tfidf  <- create_dtm(it_test, vectorizer) %>% 
        transform(tfidf)
## 
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |=================================================================| 100%

4. Train the model

t1 <- Sys.time()
glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf,
                               y = tweets_train[['sentiment']], 
                               family = 'binomial', 
                               # L1 penalty
                               alpha = 1,
                               # interested in the area under ROC curve
                               type.measure = "auc",
                               # 5-fold cross-validation
                               nfolds = 5,
                               # high value is less accurate, but has faster training
                               thresh = 1e-3,
                               # again lower number of iterations for faster training
                               maxit = 1e3)
## Warning: from glmnet Fortran code (error code -52); Convergence for 52th
## lambda value not reached after maxit=1000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -51); Convergence for 51th
## lambda value not reached after maxit=1000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -50); Convergence for 50th
## lambda value not reached after maxit=1000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -52); Convergence for 52th
## lambda value not reached after maxit=1000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -50); Convergence for 50th
## lambda value not reached after maxit=1000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -52); Convergence for 52th
## lambda value not reached after maxit=1000 iterations; solutions for larger
## lambdas returned
print(difftime(Sys.time(), t1, units = 'mins'))
## Time difference of 41.50731 mins

5. Plot the results

plot(glmnet_classifier)

print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))
## [1] "max AUC = 0.8767"
saveRDS(glmnet_classifier, 'glmnet_classifier.RDS')

Generate sentiment score with the Machine Learning model

Samsung <- readRDS("C:/Users/Mounika/Samsung.RDS")
SamsungTweets <- Samsung$tweettext

it_tweets <- itoken(SamsungTweets,
                    preprocessor = prep_fun,
                    tokenizer = tok_fun,
                    # ids = Trump$X1,
                    progressbar = TRUE)

dtm_tweets <- create_dtm(it_tweets, vectorizer)
## 
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |=================================================================| 100%

transforming data with tf-idf

dtm_tweets_tfidf <- fit_transform(dtm_tweets, tfidf)

glmnet_classifier <- readRDS('glmnet_classifier.RDS')

predict probabilities of positiveness

preds_tweets <- predict(glmnet_classifier, dtm_tweets_tfidf, type = 'response')[ ,1]

Samsung$sentiment <- preds_tweets
# color palette
cols <- c("#ce472e", "#f05336", "#ffd73e", "#eec73a", "#4ab04a")

set.seed(932)
samp_ind <- sample(c(1:nrow(Samsung)), nrow(Samsung) * 0.1) # 10% for labeling

# plotting
ggplot(Samsung, aes(x = screenName, y = sentiment, color = sentiment)) +
  theme_minimal() +
  scale_color_gradientn(colors = cols, limits = c(0, 1),
                        breaks = seq(0, 1, by = 1/4),
                        labels = c("0", round(1/4*1, 1), round(1/4*2, 1), round(1/4*3, 1), round(1/4*4, 1)),
                        guide = guide_colourbar(ticks = T, nbin = 50, barheight = .5, label = T, barwidth = 10)) +
  geom_point(aes(color = sentiment), alpha = 0.8) +
  geom_hline(yintercept = 0.65, color = "#4ab04a", size = 1.5, alpha = 0.6, linetype = "longdash") +
  geom_hline(yintercept = 0.35, color = "#f05336", size = 1.5, alpha = 0.6, linetype = "longdash") +
  geom_smooth(size = 1.2, alpha = 0.2) +
  geom_label_repel(data = Samsung[samp_ind, ],
                   aes(label = round(sentiment, 2)),
                   fontface = 'bold',
                   size = 2.5,
                   max.iter = 100) +
  theme(legend.position = 'bottom',
        legend.direction = "horizontal",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(size = 20, face = "bold", vjust = 2, color = 'black', lineheight = 0.8),
        axis.title.x = element_text(size = 16),
        axis.title.y = element_text(size = 16),
        axis.text.y = element_text(size = 8, face = "bold", color = 'black'),
        axis.text.x = element_text(size = 8, face = "bold", color = 'black')) +
  ggtitle("Tweets Sentiment rate (probability of positiveness)")
## `geom_smooth()` using method = 'loess'

# Install required packages

# install the necessary packages
# install.packages("readr")
# install.packages("plyr")
# install.packages("stringr")
# install.packages("stringi")
# install.packages("magrittr")
# install.packages("dplyr")
# install.packages('tm')
# install.packages('RColorBrewer')
# install.packages('wordcloud')

Clean data

library(readr)
gear <- read.csv("C:/Users/Mounika/Samsung_Tweets.csv", row.names=1, sep=";")
geartweets <- gear$tweettext

#********************************************
#         Clean tweets
#********************************************
#use this function to clean the tweets
clean.text = function(x)
{
  # tolower
  x = tolower(x)
  # remove rt
  x = gsub("rt", "", x)
  # remove at
  x = gsub("@\\w+", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # remove links http
  x = gsub("http\\w+", "", x)
  # remove tabs
  x = gsub("[ |\t]{2,}", "", x)
  # remove blank spaces at the beginning
  x = gsub("^ ", "", x)
  # remove blank spaces at the end
  x = gsub(" $", "", x)
  return(x)
}

# clean tweets
geartweets = clean.text(geartweets)

Topic Analysis

sports.words = scan('C:/Users/Mounika/Sports_Word.txt', what='character',comment.char=';')

score.topic = function(sentences, dict, .progress='none')
{
  require(plyr)
  require(stringr)
  require(stringi)
  
  # we got a vector of sentences. plyr will handle a list
  # or a vector as an "l" for us
  # we want a simple array of scores back, so we use
  # "l" + "a" + "ply" = "laply":
  scores = laply(sentences, function(sentence, dict) {
    
    # clean up sentences with R's regex-driven global substitute, gsub():
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub('\\d+', '', sentence)
    # and convert to lower case:
    sentence = tolower(sentence)
    
    # split into words. str_split is in the stringr package
    word.list = str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words = unlist(word.list)
    
    # compare our words to the dictionaries of positive & negative terms
    topic.matches = match(words, dict)
    
    # match() returns the position of the matched term or NA
    # we just want a TRUE/FALSE:
    topic.matches = !is.na(topic.matches)
    
    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score = sum(topic.matches)
    
    return(score)
  }, dict, .progress=.progress )
  
  topicscores.df = data.frame(score=scores, text=sentences)
  return(topicscores.df)
}

topic.scores = score.topic(geartweets, sports.words, .progress='none')
## Loading required package: plyr
## Warning: package 'plyr' was built under R version 3.4.4
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:purrr':
## 
##     compact
## Loading required package: stringi
## Warning: package 'stringi' was built under R version 3.4.4

Naive Approach

Week 5 Sentiment Analysis

pos.words = scan('C:/Users/Mounika/positive-words.txt', what='character',comment.char=';')
neg.words = scan('C:/Users/Mounika/negative-words.txt', what='character',comment.char=';')

neg.words = c(neg.words, 'wtf', 'fail')


#Implementing our sentiment scoring algorithm
require(plyr)
require(stringr)
require(stringi)

score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
  
  # we got a vector of sentences. plyr will handle a list
  # or a vector as an "l" for us
  # we want a simple array of scores back, so we use
  # "l" + "a" + "ply" = "laply":
  scores = laply(sentences, function(sentence, pos.words, neg.words) {
    
    # clean up sentences with R's regex-driven global substitute, gsub():
    sentence = gsub('[[:punct:]]', '', sentence)
    sentence = gsub('[[:cntrl:]]', '', sentence)
    sentence = gsub('\\d+', '', sentence)
    # and convert to lower case:
    sentence = tolower(sentence)
    
    # split into words. str_split is in the stringr package
    word.list = str_split(sentence, '\\s+')
    # sometimes a list() is one level of hierarchy too much
    words = unlist(word.list)
    
    # compare our words to the dictionaries of positive & negative terms
    pos.matches = match(words, pos.words)
    neg.matches = match(words, neg.words)
    
    # match() returns the position of the matched term or NA
    # we just want a TRUE/FALSE:
    pos.matches = !is.na(pos.matches)
    neg.matches = !is.na(neg.matches)
    
    # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
    score = sum(pos.matches) - sum(neg.matches)
    
    return(score)
  }, pos.words, neg.words, .progress=.progress )
  
  scores.df = data.frame(score=scores, text=sentences)
  return(scores.df)
}

Pie chart of tweets mentioning sports

topic.negative = subset(sentiment.scores, score < 0)
topic.positive = subset(sentiment.scores, score > 0)
topic.neutral = subset(sentiment.scores, score = 0)

Negative = nrow(topic.negative)
Positive = nrow(topic.positive)
Netural = nrow(topic.neutral)

dftemp=data.frame(topic=c("Negative", "Positive","Netural"), 
                  number=c(Negative,Positive,Netural))

library(plotly)
p <- plot_ly(data=dftemp, labels = ~topic, values = ~number, type = 'pie') %>%
  layout(title = 'Pie chart to show the percentage of positive, negative, and neutral tweets',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p

Results

  1. R script to train a GLMnet model using 1.6 million labeled Twitter tweets Result: Sentiment Max. Accuracy of 87.67% (Positivity).
  2. R notebook of Week 5 (the naive approach) Result: 74% Positivity
  3. Compare the sentiment scores of the two methods, Result: ML sentiment score is more accurate compared to naive approach ```