Task 2 : Training a machine to classify tweets according to sentiment

Loading the required libraries for the task

require("twitteR")||install.packages("twitteR")
## Loading required package: twitteR
## [1] TRUE
require("base64enc")||install.packages("base64enc")
## Loading required package: base64enc
## [1] TRUE
require("tm")||install.packages("tm")
## Loading required package: tm
## Loading required package: NLP
## [1] TRUE
require("RTextTools")||install.packages("RTextTools")
## Loading required package: RTextTools
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## [1] TRUE
require("qdapRegex")||install.packages("qdapRegex")
## Loading required package: qdapRegex
## [1] TRUE
library(twitteR)
library(base64enc)
library(tm)
library(RTextTools)
library(qdapRegex)

Authenticating with the Twitter API to get the data

# Twitter API credentials are read from environment variables (for example
# set in ~/.Renviron) instead of being written into the script, so that the
# secrets do not end up in the source, in rendered reports or in version
# control. Any credentials that were previously committed in plain text
# should be regenerated in the Twitter developer console.
api_key <- Sys.getenv("TWITTER_API_KEY")                          # Consumer key

api_secret <- Sys.getenv("TWITTER_API_SECRET")                    # Consumer secret

access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")                # Access token

access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET")  # Access token secret

# Sys.getenv() returns "" for an unset variable; fail early with a clear
# message rather than letting the OAuth handshake fail later.
if (!all(nzchar(c(api_key, api_secret, access_token, access_token_secret)))) {
  stop("Set TWITTER_API_KEY, TWITTER_API_SECRET, TWITTER_ACCESS_TOKEN and ",
       "TWITTER_ACCESS_TOKEN_SECRET before running this script.",
       call. = FALSE)
}

# If this call asks an interactive question, type 1 to select Yes
# (note carried over from the original script).

setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)
## [1] "Using direct authentication"

Getting Tweets from the api and storing the data based on hashtags.

# Fetch up to 1000 tweets per hashtag, clean the tweet text with a sequence of
# regular expressions, and store one data frame per hashtag in `totalTweets`
# (list names are the hashtags without the leading '#').
hashtags <- c('#ClimateChange', '#Trump', '#Demonetization', '#Kejriwal','#Technology')
totalTweets <- list()
for (hashtag in hashtags) {
  tweets <- searchTwitter(hashtag, n = 1000)   # hash tag for tweets search and number of tweets

  # Skip hashtags that return nothing: there is no data frame to build from
  # an empty result list.
  if (length(tweets) == 0) {
    next
  }

  tweets <- twListToDF(tweets)                 # Convert from list to dataframe

  tweets.df <- tweets[, 1]                     # first column holds the text to clean

  # Links are removed first, while "://", "." and "/" are still present to
  # delimit them. Removing them after the punctuation has been stripped
  # leaves fragments such as a bare "http" behind.
  tweets.df <- gsub("http\\S*", "", tweets.df)

  tweets.df <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets.df)  # retweet / via markers
  tweets.df <- gsub("@\\w+", "", tweets.df)                        # @user mentions
  tweets.df <- gsub("[[:punct:]]", "", tweets.df)                  # punctuation marks
  tweets.df <- gsub("[[:digit:]]", "", tweets.df)                  # numbers
  tweets.df <- gsub("\n", " ", tweets.df)                          # new lines -> blank
  tweets.df <- gsub("[^[:alnum:]///' ]", " ", tweets.df)           # keep only alpha numeric
  tweets.df <- iconv(tweets.df, "latin1", "ASCII", sub = "")       # keep only ASCII characters

  # Whitespace is normalised last, because the substitutions above can
  # themselves introduce runs of blanks.
  tweets.df <- gsub("[ \t]{2,}", " ", tweets.df)                   # collapse repeated blanks
  tweets.df <- gsub("^\\s+|\\s+$", "", tweets.df)                  # trim leading/trailing space

  tweets[, 1] <- tweets.df                     # save cleaned text back in the data frame
  totalTweets[[paste0(gsub('#','',hashtag))]] <- tweets
}

Combining all the tweets into a single corpus.

# Stack the per-hashtag data frames held in the list into one data frame.
HashTagTweetsCombined <- do.call(rbind, totalTweets)

# Inspect the combined data set: its dimensions ...
dim(HashTagTweetsCombined)
## [1] 5000   16
# ... and its column-by-column structure.
str(HashTagTweetsCombined)
## 'data.frame':    5000 obs. of  16 variables:
##  $ text         : chr  "LYING Globalist GreenScum ManMade ClimateChange is fiction ALL recent Extremes are WRONG SORT for CO CON gthttp" "Elephants orangutans butterflies bees amp of the planets other species              climatechange pollution poaching http" "Meet climatechange s apocalyptic twin oceans poisoned by plastic BreakFreeFromPlastic" "My on just went live cleanenergy ecomodernism poverty climatechange nu" ...
##  $ favorited    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ favoriteCount: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ replyToSN    : chr  NA NA NA NA ...
##  $ created      : POSIXct, format: "2016-12-18 08:26:57" "2016-12-18 08:26:29" ...
##  $ truncated    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ replyToSID   : chr  NA NA NA NA ...
##  $ id           : chr  "810400950703767552" "810400833372389376" "810400823226355712" "810400733598257152" ...
##  $ replyToUID   : chr  NA NA NA NA ...
##  $ statusSource : chr  "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"https://roundteam.co\" rel=\"nofollow\">RoundTeam</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Mobile Web (M5)</a>" ...
##  $ screenName   : chr  "lullabywoman" "GreenTechTrends" "punc14Audrey" "pronuclear" ...
##  $ retweetCount : num  10 1 155 47 4 51 121 42 2 2 ...
##  $ isRetweet    : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
##  $ retweeted    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ longitude    : chr  NA NA NA NA ...
##  $ latitude     : chr  NA NA NA NA ...

Text preprocessing using the qdap regex package.

# Second cleaning pass over the tweet text using qdapRegex.
# The rm_* helpers take a whole character vector, so they are applied to the
# column directly. Wrapping them in sapply() is unnecessary and attaches each
# tweet's own text as an element name on the result.

# Remove URLs (Twitter short links as well as general URLs).
HashTagTweetsCombined$text <- rm_url(HashTagTweetsCombined$text,
                                     pattern = pastex("@rm_twitter_url", "@rm_url"))

# Remove e-mail addresses from the tweet text.
HashTagTweetsCombined$text <- rm_email(HashTagTweetsCombined$text)

# Remove Twitter user tags (@handle). Hash tags (#topic) are a separate
# pattern in qdapRegex (rm_hash) and are not removed here.
HashTagTweetsCombined$text <- rm_tag(HashTagTweetsCombined$text)

# Remove dates. This runs before rm_number so that the digits a date
# pattern needs are still present when it is matched.
HashTagTweetsCombined$text <- rm_date(HashTagTweetsCombined$text)

# Remove numbers from the text.
HashTagTweetsCombined$text <- rm_number(HashTagTweetsCombined$text)

# Remove non-ASCII characters.
HashTagTweetsCombined$text <- rm_non_ascii(HashTagTweetsCombined$text)

# Remove extra white space left behind by the steps above.
HashTagTweetsCombined$text <- rm_white(HashTagTweetsCombined$text)

Storing the file locally so that the polarity of each tweet can be labelled manually as neutral, positive or negative sentiment.

# The tweets were labelled by hand in a CSV file (column `sentiment`);
# read that labelled file back in.

data <- read.csv("data2.csv")
# Treat the hand-assigned label as a categorical variable.
data[["sentiment"]] <- factor(data[["sentiment"]])

Splitting the data into train and test and creating the document term matrix

# Fix the random seed so the same split is drawn on every run.
set.seed(16102016)

# Draw 70% of the row indices at random, without replacement.
samp_id <- sample(seq_len(nrow(data)),
                  size = round(nrow(data) * .70),
                  replace = FALSE)

# Rows that were drawn form the training set; all remaining rows form the
# test set.
train <- data[samp_id, ]
test <- data[-samp_id, ]

# Report the size of each part (test first, then train).
dim(test)
dim(train)
## [1] 342   3
## [1] 797   3
# Stack the training rows on top of the test rows so that one document-term
# matrix covers both; the first nrow(train) rows are the training part.
train.data <- rbind(train, test)
train.data$text <- tolower(train.data$text)   # lower-case all tweet text

# Strip punctuation, then numbers, then surplus white space.
text <- stripWhitespace(removeNumbers(removePunctuation(train.data$text)))

# Build a corpus with one document per tweet.
cor <- Corpus(VectorSource(text))

# Document-term matrix weighted by TF-IDF, without normalisation.
dtm <- DocumentTermMatrix(
  cor,
  control = list(
    weighting = function(x) weightTfIdf(x, normalize = FALSE)
  )
)

# Sentiment labels, in the same row order as the matrix.
training_codes <- train.data$sentiment
dim(dtm)
## [1] 1139 4201

Testing the models and choosing the best model

After many iterations of testing with models such as RF, TREE, bagging, MAXENT and SLDA, GLMNET was found to give the highest accuracy.

# Bundle the document-term matrix and its labels into an RTextTools
# container. Rows 1..nrow(train) are used for training and the remaining
# rows for testing.
container <- create_container(
  dtm,
  t(training_codes),                               # labels, i.e. the outcome to train on
  trainSize = seq_len(nrow(train)),
  testSize = (nrow(train) + 1):nrow(train.data),
  virgin = FALSE                                   # test rows are not treated as 'virgin' data
)
# NOTE(review): virgin = FALSE presumably means the test rows carry known
# labels - confirm against the RTextTools documentation.
str(container)     # show the slots of the container object
## Formal class 'matrix_container' [package "RTextTools"] with 6 slots
##   ..@ training_matrix      :Formal class 'matrix.csr' [package "SparseM"] with 4 slots
##   .. .. ..@ ra       : num [1:8435] 6.35 1.16 8.15 4.69 5.35 ...
##   .. .. ..@ ja       : int [1:8435] 323 602 834 1412 1501 1505 1800 2932 3588 310 ...
##   .. .. ..@ ia       : int [1:798] 1 10 16 25 33 45 57 70 73 82 ...
##   .. .. ..@ dimension: int [1:2] 797 4201
##   ..@ classification_matrix:Formal class 'matrix.csr' [package "SparseM"] with 4 slots
##   .. .. ..@ ra       : num [1:3757] 1.16 8.15 5.07 8.15 6.98 ...
##   .. .. ..@ ja       : int [1:3757] 602 1399 2345 2842 67 166 172 602 1569 1586 ...
##   .. .. ..@ ia       : int [1:343] 1 5 18 26 33 45 55 64 74 78 ...
##   .. .. ..@ dimension: int [1:2] 342 4201
##   ..@ training_codes       : Factor w/ 3 levels "negative","neutral",..: 2 2 2 2 3 2 2 3 3 2 ...
##   ..@ testing_codes        : Factor w/ 3 levels "negative","neutral",..: 3 3 2 2 3 3 3 2 2 2 ...
##   ..@ column_names         : chr [1:4201] "aan" "abandoning" "abc" "abd" ...
##   ..@ virgin               : logi FALSE
# Fit the chosen algorithm on the training part of the container.
# Algorithm names listed in the original note:
# "MAXENT","SVM","GLMNET","SLDA","TREE","BAGGING","BOOSTING","RF"
models <- train_models(container, algorithms = "GLMNET")

# Classify the test part of the container with the fitted model.
results <- classify_models(container, models)

# First few predictions: label and probability.
head(results)
##   GLMNET_LABEL GLMNET_PROB
## 1     positive   0.7933092
## 2     positive   0.8806298
## 3      neutral   0.8800871
## 4      neutral   0.8614958
## 5      neutral   0.8244259
## 6     positive   0.9047524

Building a confusion matrix to see the accuracy of the prediction results

# Put the model's predictions next to the true labels of the test rows
# (every row of train.data after the first nrow(train) rows).
out <- data.frame(
  model_sentiment = results$GLMNET_LABEL,                          # predicted label
  model_prob = results$GLMNET_PROB,                                # probability reported with it
  actual_sentiment = train.data$sentiment[-seq_len(nrow(train))]   # actual label
)

dim(out)
head(out)
## [1] 342   3
##   model_sentiment model_prob actual_sentiment
## 1        positive  0.7933092         positive
## 2        positive  0.8806298         positive
## 3         neutral  0.8800871          neutral
## 4         neutral  0.8614958          neutral
## 5         neutral  0.8244259         positive
## 6        positive  0.9047524         positive
summary(out)           # per-class counts of predicted and actual sentiment, plus the spread of model_prob
##  model_sentiment   model_prob     actual_sentiment
##  negative: 17    Min.   :0.3776   negative: 44    
##  neutral :237    1st Qu.:0.7686   neutral :178    
##  positive: 88    Median :0.8541   positive:120    
##                  Mean   :0.8038                   
##                  3rd Qu.:0.8658                   
##                  Max.   :0.9995
# Confusion matrix: predicted sentiment in rows, actual sentiment in columns.
(z <- as.matrix(table(out[, 1], out[, 3])))   # display the confusion matrix.
##           
##            negative neutral positive
##   negative       13       1        3
##   neutral        26     172       39
##   positive        5       5       78
# Accuracy is the share of observations on the diagonal. There are three
# classes, so all three diagonal cells must be counted; adding only z[1,1]
# and z[2,2] leaves out the correctly predicted 'positive' tweets.
# This assumes rows and columns list the same classes in the same order,
# as in the table above.
(pct <- round((sum(diag(z)) / sum(z)) * 100, 2))      # prediction accuracy in % terms
## [1] 76.9

Analysis

The hashtags chosen cover some of the most talked-about recent topics: climate change, demonetization in India, Kejriwal (the chief minister of Delhi), Trump (the newly elected president of the United States of America) and technology. After thorough cleaning, the data set contains 1139 rows, of which 563 are neutral, 389 are positive and the remaining 187 are negative, so the data set as a whole is dominated by neutral rows. Many models were tried in search of the best predictive accuracy: RF, MAXENT, bagging, tree, boosting, SLDA, SVM and GLMNET. GLMNET gave the best result of the lot: on the held-out test data it classifies 263 of the 342 tweets correctly (counting all three diagonal cells of the confusion matrix, 13 + 172 + 78), i.e. about 77 percent accuracy.