install.packages('RTextTools')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'RTextTools' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('e1071')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'e1071' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('readr')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'readr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('tidyverse')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('text2vec')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'text2vec' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('ggrepel')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'ggrepel' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('purrrlyr')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'purrrlyr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('caret')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'caret' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('glmnet')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'glmnet' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
install.packages('ggplot2')
## Installing package into 'C:/Users/Mounika/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Mounika\AppData\Local\Temp\Rtmpy0pkKv\downloaded_packages
library(purrrlyr)
## Warning: package 'purrrlyr' was built under R version 3.4.4
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.4
## -- Attaching packages -------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.8.0 v stringr 1.3.0
## v readr 1.1.1 v forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'readr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.4
## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(text2vec)
## Warning: package 'text2vec' was built under R version 3.4.4
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.4
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.4
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded glmnet 2.0-16
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 3.4.4
library(ggplot2)
tweets_classified <- readRDS("C:/Users/Mounika/tweets_classified.RDS")
# data splitting on train and test by 0.8 / 0.2
set.seed(2340)
trainIndex <- createDataPartition(tweets_classified$sentiment, p = 0.8,
list = FALSE,
times = 1)
tweets_train <- tweets_classified[trainIndex, ]
tweets_test <- tweets_classified[-trainIndex, ]
# define preprocessing function and tokenization function
prep_fun <- tolower
tok_fun <- word_tokenizer
it_train <- itoken(tweets_train$text,
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = tweets_train$id,
progressbar = TRUE)
it_test <- itoken(tweets_test$text,
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = tweets_test$id,
progressbar = TRUE)
# creating vocabulary and document-term matrix
vocab <- create_vocabulary(it_train)
##
|
|====== | 10%
|
|============= | 20%
|
|==================== | 30%
|
|========================== | 40%
|
|================================ | 50%
|
|======================================= | 60%
|
|============================================== | 70%
|
|==================================================== | 80%
|
|========================================================== | 90%
|
|=================================================================| 100%
vectorizer <- vocab_vectorizer(vocab)
dtm_train <- create_dtm(it_train, vectorizer)
##
|
|====== | 10%
|
|============= | 20%
|
|==================== | 30%
|
|========================== | 40%
|
|================================ | 50%
|
|======================================= | 60%
|
|============================================== | 70%
|
|==================================================== | 80%
|
|========================================================== | 90%
|
|=================================================================| 100%
# define tf-idf model
tfidf <- TfIdf$new()
# fit the model to the train data and transform it with the fitted model
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)
# apply pre-trained tf-idf transformation to test data
dtm_test_tfidf <- create_dtm(it_test, vectorizer) %>%
transform(tfidf)
##
|
|====== | 10%
|
|============= | 20%
|
|==================== | 30%
|
|========================== | 40%
|
|================================ | 50%
|
|======================================= | 60%
|
|============================================== | 70%
|
|==================================================== | 80%
|
|========================================================== | 90%
|
|=================================================================| 100%
t1 <- Sys.time()
glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf,
y = tweets_train[['sentiment']],
family = 'binomial',
# L1 penalty
alpha = 1,
# interested in the area under ROC curve
type.measure = "auc",
# 5-fold cross-validation
nfolds = 5,
# high value is less accurate, but has faster training
thresh = 1e-3,
# again lower number of iterations for faster training
maxit = 1e3)
## Warning: from glmnet Fortran code (error code -52); Convergence for 52th
## lambda value not reached after maxit=1000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -51); Convergence for 51th
## lambda value not reached after maxit=1000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -50); Convergence for 50th
## lambda value not reached after maxit=1000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -52); Convergence for 52th
## lambda value not reached after maxit=1000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -50); Convergence for 50th
## lambda value not reached after maxit=1000 iterations; solutions for larger
## lambdas returned
## Warning: from glmnet Fortran code (error code -52); Convergence for 52th
## lambda value not reached after maxit=1000 iterations; solutions for larger
## lambdas returned
print(difftime(Sys.time(), t1, units = 'mins'))
## Time difference of 41.50731 mins
plot(glmnet_classifier)
print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))
## [1] "max AUC = 0.8767"
saveRDS(glmnet_classifier, 'glmnet_classifier.RDS')
Samsung <- readRDS("C:/Users/Mounika/Samsung.RDS")
SamsungTweets <- Samsung$tweettext
it_tweets <- itoken(SamsungTweets,
preprocessor = prep_fun,
tokenizer = tok_fun,
# ids = Trump$X1,
progressbar = TRUE)
dtm_tweets <- create_dtm(it_tweets, vectorizer)
##
|
|====== | 10%
|
|============= | 20%
|
|==================== | 30%
|
|========================== | 40%
|
|================================ | 50%
|
|======================================= | 60%
|
|============================================== | 70%
|
|==================================================== | 80%
|
|========================================================== | 90%
|
|=================================================================| 100%
dtm_tweets_tfidf <- fit_transform(dtm_tweets, tfidf)
glmnet_classifier <- readRDS('glmnet_classifier.RDS')
preds_tweets <- predict(glmnet_classifier, dtm_tweets_tfidf, type = 'response')[ ,1]
Samsung$sentiment <- preds_tweets
# color palette
cols <- c("#ce472e", "#f05336", "#ffd73e", "#eec73a", "#4ab04a")
set.seed(932)
samp_ind <- sample(c(1:nrow(Samsung)), nrow(Samsung) * 0.1) # 10% for labeling
# plotting
ggplot(Samsung, aes(x = screenName, y = sentiment, color = sentiment)) +
theme_minimal() +
scale_color_gradientn(colors = cols, limits = c(0, 1),
breaks = seq(0, 1, by = 1/4),
labels = c("0", round(1/4*1, 1), round(1/4*2, 1), round(1/4*3, 1), round(1/4*4, 1)),
guide = guide_colourbar(ticks = T, nbin = 50, barheight = .5, label = T, barwidth = 10)) +
geom_point(aes(color = sentiment), alpha = 0.8) +
geom_hline(yintercept = 0.65, color = "#4ab04a", size = 1.5, alpha = 0.6, linetype = "longdash") +
geom_hline(yintercept = 0.35, color = "#f05336", size = 1.5, alpha = 0.6, linetype = "longdash") +
geom_smooth(size = 1.2, alpha = 0.2) +
geom_label_repel(data = Samsung[samp_ind, ],
aes(label = round(sentiment, 2)),
fontface = 'bold',
size = 2.5,
max.iter = 100) +
theme(legend.position = 'bottom',
legend.direction = "horizontal",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.title = element_text(size = 20, face = "bold", vjust = 2, color = 'black', lineheight = 0.8),
axis.title.x = element_text(size = 16),
axis.title.y = element_text(size = 16),
axis.text.y = element_text(size = 8, face = "bold", color = 'black'),
axis.text.x = element_text(size = 8, face = "bold", color = 'black')) +
ggtitle("Tweets Sentiment rate (probability of positiveness)")
## `geom_smooth()` using method = 'loess'
# Install required packages
# install the necessary packages
# install.packages("readr")
# install.packages("plyr")
# install.packages("stringr")
# install.packages("stringi")
# install.packages("magrittr")
# install.packages("dplyr")
# install.packages('tm')
# install.packages('RColorBrewer')
# install.packages('wordcloud')
library(readr)
gear <- read.csv("C:/Users/Mounika/Samsung_Tweets.csv", row.names=1, sep=";")
geartweets <- gear$tweettext
#********************************************
# Clean tweets
#********************************************
#use this function to clean the tweets
clean.text = function(x)
{
# tolower
x = tolower(x)
# remove rt
x = gsub("rt", "", x)
# remove at
x = gsub("@\\w+", "", x)
# remove punctuation
x = gsub("[[:punct:]]", "", x)
# remove numbers
x = gsub("[[:digit:]]", "", x)
# remove links http
x = gsub("http\\w+", "", x)
# remove tabs
x = gsub("[ |\t]{2,}", "", x)
# remove blank spaces at the beginning
x = gsub("^ ", "", x)
# remove blank spaces at the end
x = gsub(" $", "", x)
return(x)
}
# clean tweets
geartweets = clean.text(geartweets)
sports.words = scan('C:/Users/Mounika/Sports_Word.txt', what='character',comment.char=';')
score.topic = function(sentences, dict, .progress='none')
{
require(plyr)
require(stringr)
require(stringi)
# we got a vector of sentences. plyr will handle a list
# or a vector as an "l" for us
# we want a simple array of scores back, so we use
# "l" + "a" + "ply" = "laply":
scores = laply(sentences, function(sentence, dict) {
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
topic.matches = match(words, dict)
# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
topic.matches = !is.na(topic.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(topic.matches)
return(score)
}, dict, .progress=.progress )
topicscores.df = data.frame(score=scores, text=sentences)
return(topicscores.df)
}
topic.scores = score.topic(geartweets, sports.words, .progress='none')
## Loading required package: plyr
## Warning: package 'plyr' was built under R version 3.4.4
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
## Loading required package: stringi
## Warning: package 'stringi' was built under R version 3.4.4
pos.words = scan('C:/Users/Mounika/positive-words.txt', what='character',comment.char=';')
neg.words = scan('C:/Users/Mounika/negative-words.txt', what='character',comment.char=';')
neg.words = c(neg.words, 'wtf', 'fail')
#Implementing our sentiment scoring algorithm
require(plyr)
require(stringr)
require(stringi)
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
# we got a vector of sentences. plyr will handle a list
# or a vector as an "l" for us
# we want a simple array of scores back, so we use
# "l" + "a" + "ply" = "laply":
scores = laply(sentences, function(sentence, pos.words, neg.words) {
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
return(score)
}, pos.words, neg.words, .progress=.progress )
scores.df = data.frame(score=scores, text=sentences)
return(scores.df)
}
topic.negative = subset(sentiment.scores, score < 0)
topic.positive = subset(sentiment.scores, score > 0)
topic.neutral = subset(sentiment.scores, score = 0)
Negative = nrow(topic.negative)
Positive = nrow(topic.positive)
Netural = nrow(topic.neutral)
dftemp=data.frame(topic=c("Negative", "Positive","Netural"),
number=c(Negative,Positive,Netural))
library(plotly)
p <- plot_ly(data=dftemp, labels = ~topic, values = ~number, type = 'pie') %>%
layout(title = 'Pie chart to show the percentage of positive, negative, and neutral tweets',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p