require("twitteR")||install.packages("twitteR")
## Loading required package: twitteR
## [1] TRUE
require("base64enc")||install.packages("base64enc")
## Loading required package: base64enc
## [1] TRUE
require("tm")||install.packages("tm")
## Loading required package: tm
## Loading required package: NLP
## [1] TRUE
require("RTextTools")||install.packages("RTextTools")
## Loading required package: RTextTools
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
## [1] TRUE
require("qdapRegex")||install.packages("qdapRegex")
## Loading required package: qdapRegex
## [1] TRUE
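Note that the require(x) || install.packages(x) idiom used above installs a missing package but does not load it within the same run, since install.packages() only downloads and installs. A more robust sketch of the same setup (same five packages):

# Sketch: install any missing package, then load it in the same pass
for (p in c("twitteR", "base64enc", "tm", "RTextTools", "qdapRegex")) {
  if (!require(p, character.only = TRUE)) {  # TRUE if already installed and loadable
    install.packages(p)                      # install on first use
    library(p, character.only = TRUE)        # then load it
  }
}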
library(twitteR)
library(base64enc)
library(tm)
library(RTextTools)
library(qdapRegex)
api_key <- "ovvVsOEI4HQfZxhOfJZGajzIO" #Consumer key: *
api_secret <- "hQYN4iCH2ymxcwGCdb9CNB5u4TYIQZn86Nq7wfUmAYpnjpdwBZ" # Consumer secret: *
access_token <- "150991402-mGrUzS71VWPkqatCJAT53a7oCele7mzXGdP25SvO" # Access token:
access_token_secret <- "QlShB7a31kyumUpXBMS29yEulvznZLrXSwOxjnk01h4YN" # Access token secret:
# When prompted by the next command, type 1 to select 'Yes'
setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
## [1] "Using direct authentication"
#Combining the per-hashtag data frames from the list into one dataset
HashTagTweetsCombined = do.call("rbind", totalTweets)
#dimensions of the combined dataset
dim(HashTagTweetsCombined)
## [1] 5000 16
str(HashTagTweetsCombined)
## 'data.frame': 5000 obs. of 16 variables:
## $ text : chr "LYING Globalist GreenScum ManMade ClimateChange is fiction ALL recent Extremes are WRONG SORT for CO CON gthttp" "Elephants orangutans butterflies bees amp of the planets other species climatechange pollution poaching http" "Meet climatechange s apocalyptic twin oceans poisoned by plastic BreakFreeFromPlastic" "My on just went live cleanenergy ecomodernism poverty climatechange nu" ...
## $ favorited : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ favoriteCount: num 0 0 0 0 0 0 0 0 0 0 ...
## $ replyToSN : chr NA NA NA NA ...
## $ created : POSIXct, format: "2016-12-18 08:26:57" "2016-12-18 08:26:29" ...
## $ truncated : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ replyToSID : chr NA NA NA NA ...
## $ id : chr "810400950703767552" "810400833372389376" "810400823226355712" "810400733598257152" ...
## $ replyToUID : chr NA NA NA NA ...
## $ statusSource : chr "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"https://roundteam.co\" rel=\"nofollow\">RoundTeam</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Mobile Web (M5)</a>" ...
## $ screenName : chr "lullabywoman" "GreenTechTrends" "punc14Audrey" "pronuclear" ...
## $ retweetCount : num 10 1 155 47 4 51 121 42 2 2 ...
## $ isRetweet : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
## $ retweeted : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ longitude : chr NA NA NA NA ...
## $ latitude : chr NA NA NA NA ...
#Cleaning the text: removing URLs
HashTagTweetsCombined$text = sapply(HashTagTweetsCombined$text, rm_url, pattern = pastex("@rm_twitter_url", "@rm_url"))
#removing emails from the tweet text (rm_email removes patterns containing @)
HashTagTweetsCombined$text = sapply(HashTagTweetsCombined$text, rm_email)
#removing user tags (@mentions) from the tweet text
HashTagTweetsCombined$text = sapply(HashTagTweetsCombined$text, rm_tag)
#removing numbers from the text
HashTagTweetsCombined$text = sapply(HashTagTweetsCombined$text, rm_number)
#removing non-ASCII characters
HashTagTweetsCombined$text = sapply(HashTagTweetsCombined$text, rm_non_ascii)
#removing extra white space
HashTagTweetsCombined$text = sapply(HashTagTweetsCombined$text, rm_white)
#removing dates from the text
HashTagTweetsCombined$text = sapply(HashTagTweetsCombined$text, rm_date)
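The next step relies on manual labelling; a hedged sketch of the export that would precede it (the unlabelled file name is a hypothetical; the labelled file read back below is data2.csv):

# Sketch: write the cleaned text out so a sentiment column can be added by hand
write.csv(data.frame(text = HashTagTweetsCombined$text),
          "data2_unlabelled.csv", row.names = FALSE)  # hypothetical file name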
#The cleaned tweets were exported to CSV, manually labelled with sentiment, and reloaded here
data = read.csv("data2.csv")
data$sentiment = factor(data$sentiment)
set.seed(16102016) # To fix the sample
samp_id = sample(1:nrow(data), # do ?sample to examine the sample() func
round(nrow(data)*.70), # 70% records will be used for training
replace = F) # sampling without replacement.
train = data[samp_id,] # training set: 70% of the labelled records
test = data[-samp_id,] # test set: the remaining 30%
dim(test) ; dim(train)
## [1] 342 3
## [1] 797 3
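Before modelling, it is worth checking the class balance, since (as noted in the conclusion) neutral tweets dominate the labelled data. A quick sketch:

table(data$sentiment)  # counts of negative / neutral / positive labels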
train.data = rbind(train, test) # re-join so the DTM covers both sets (train rows first, then test)
train.data$text = tolower(train.data$text) # Convert to lower case
text = train.data$text
text = removePunctuation(text) # remove punctuation marks
text = removeNumbers(text) # remove numbers
text = stripWhitespace(text) # remove blank space
cor = Corpus(VectorSource(text)) # Create text corpus
dtm = DocumentTermMatrix(cor, # Create DTM
              control = list(weighting =
                      function(x)
                      weightTfIdf(x, normalize = F))) # TF-IDF weighting
training_codes = train.data$sentiment # Coded labels
dim(dtm)
## [1] 1139 4201
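A quick sanity check on the DTM can help before building the container; a sketch (the 0.99 sparsity threshold is an arbitrary choice for illustration):

inspect(dtm[1:5, 1:5])            # peek at a corner of the weighted DTM
dim(removeSparseTerms(dtm, 0.99)) # how many terms survive a sparsity cut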
After many iterations of testing models such as RF, TREE, BAGGING, MAXENT, and SLDA, GLMNET was found to give the highest accuracy.
container <- create_container(dtm, # creates a 'container' obj for training, classifying, and analyzing docs
t(training_codes), # labels or the Y variable / outcome we want to train on
trainSize = 1:nrow(train),
testSize = (nrow(train)+1):nrow(train.data),
virgin = FALSE) # whether to treat the classification data as 'virgin' data or not.
# if virgin = TRUE, the machine won't borrow from prior datasets.
str(container) # view structure of the container obj: an S4 object holding the training and test matrices plus their labels
## Formal class 'matrix_container' [package "RTextTools"] with 6 slots
## ..@ training_matrix :Formal class 'matrix.csr' [package "SparseM"] with 4 slots
## .. .. ..@ ra : num [1:8435] 6.35 1.16 8.15 4.69 5.35 ...
## .. .. ..@ ja : int [1:8435] 323 602 834 1412 1501 1505 1800 2932 3588 310 ...
## .. .. ..@ ia : int [1:798] 1 10 16 25 33 45 57 70 73 82 ...
## .. .. ..@ dimension: int [1:2] 797 4201
## ..@ classification_matrix:Formal class 'matrix.csr' [package "SparseM"] with 4 slots
## .. .. ..@ ra : num [1:3757] 1.16 8.15 5.07 8.15 6.98 ...
## .. .. ..@ ja : int [1:3757] 602 1399 2345 2842 67 166 172 602 1569 1586 ...
## .. .. ..@ ia : int [1:343] 1 5 18 26 33 45 55 64 74 78 ...
## .. .. ..@ dimension: int [1:2] 342 4201
## ..@ training_codes : Factor w/ 3 levels "negative","neutral",..: 2 2 2 2 3 2 2 3 3 2 ...
## ..@ testing_codes : Factor w/ 3 levels "negative","neutral",..: 3 3 2 2 3 3 3 2 2 2 ...
## ..@ column_names : chr [1:4201] "aan" "abandoning" "abc" "abd" ...
## ..@ virgin : logi FALSE
models <- train_models(container, # ?train_models; makes a model object using the specified algorithms.
algorithms=c("GLMNET")) #"MAXENT","SVM","GLMNET","SLDA","TREE","BAGGING","BOOSTING","RF"
results <- classify_models(container, models)
head(results)
## GLMNET_LABEL GLMNET_PROB
## 1 positive 0.7933092
## 2 positive 0.8806298
## 3 neutral 0.8800871
## 4 neutral 0.8614958
## 5 neutral 0.8244259
## 6 positive 0.9047524
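The earlier claim that GLMNET beat the other learners can be checked with RTextTools' built-in analytics; a hedged sketch of such a comparison (algorithm list as named above, not the author's exact runs):

models_all  <- train_models(container, algorithms = c("GLMNET","SVM","MAXENT","SLDA","TREE","BAGGING","RF"))
results_all <- classify_models(container, models_all)
analytics   <- create_analytics(container, results_all)
summary(analytics)  # per-algorithm precision, recall, and F-scores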
out = data.frame(model_sentiment = results$GLMNET_LABEL, # model's predicted sentiment label
model_prob = results$GLMNET_PROB,
actual_sentiment = train.data$sentiment[(nrow(train)+1):nrow(train.data)]) # actual value of Y
dim(out); head(out);
## [1] 342 3
## model_sentiment model_prob actual_sentiment
## 1 positive 0.7933092 positive
## 2 positive 0.8806298 positive
## 3 neutral 0.8800871 neutral
## 4 neutral 0.8614958 neutral
## 5 neutral 0.8244259 positive
## 6 positive 0.9047524 positive
summary(out) # distribution of predicted vs. actual sentiment classes
## model_sentiment model_prob actual_sentiment
## negative: 17 Min. :0.3776 negative: 44
## neutral :237 1st Qu.:0.7686 neutral :178
## positive: 88 Median :0.8541 positive:120
## Mean :0.8038
## 3rd Qu.:0.8658
## Max. :0.9995
(z = as.matrix(table(out[,1], out[,3]))) # display the confusion matrix.
##
## negative neutral positive
## negative 13 1 3
## neutral 26 172 39
## positive 5 5 78
(pct = round((sum(diag(z))/sum(z))*100, 2)) # prediction accuracy in % terms, over all three diagonal cells
## [1] 76.9
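Given the class imbalance, per-class performance is more informative than overall accuracy; a quick sketch:

round(diag(z) / colSums(z), 2) # recall per actual class (columns of z are actuals)
round(diag(z) / rowSums(z), 2) # precision per predicted class (rows are predictions)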
The hashtags chosen were among the most talked-about recent topics: climate change, demonetization in India, Kejriwal (the Chief Minister of Delhi), and Trump (the newly elected President of the United States of America). After thorough cleaning, the labelled dataset contains 1139 rows, of which 563 are neutral, 389 are positive, and the remaining 187 are negative, so neutral rows dominate the dataset as a whole. Of the many models tried (RF, MAXENT, BAGGING, TREE, BOOSTING, SLDA, SVM, and GLMNET), GLMNET gave the best predictive accuracy, about 77 percent on the held-out test data.