http://rpubs.com/neerajkhattar/Classifytweets

Task 2 - Training a machine to classify tweets according to sentiment

Libraries needed to execute the code:

# Install any required package that is not yet available.
# requireNamespace() only checks availability; attaching is done by the
# library() calls that follow.
# Two problems with the `require(pkg) || install.packages(pkg)` form are
# avoided here: install.packages() returns NULL, which `||` rejects with an
# error right after a fresh install, and the "tm" line used to install
# "rJava" instead of "tm".
for (pkg in c("twitteR", "base64enc", "tm", "RTextTools", "magrittr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

library("twitteR")
library("base64enc")
library("tm")
library("RTextTools")
library("magrittr")

Step 1 - Choose any six different recent twitter hashtags with or without sentiment

We have picked the following six words to complete this task

  1. axisbank

  2. flipkart

  3. Paytm

  4. narendramodi

  5. rahulgandhi

  6. demonetisation

#############################################
# The code below is commented out because it requires a connection to Twitter and the associated personal keys.
# Authentication
#############################################

#api_key <- "XXXXXXXXXXXXXXXXXX"   #Consumer key: *

#api_secret <- "XXXXXXXXXXXXXXXXXX"   # Consumer secret: *

#access_token <- "XXXXXXXXXXXXXXXXXX"  # Access token: 

#access_token_secret <- "XXXXXXXXXXXXXXXXXX" # Access token secret: 

# After running the next command, type 1 to select "Yes" when prompted.

#setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)


#############################################
# Extract Tweets
#############################################

#hashtags = c('#axisbank', '#flipkart', '#Paytm', '#narendramodi', '#rahulgandhi', '#demonetisation')

#for (hashtag in hashtags){
#  tweets = searchTwitter(hashtag, n=10 )     # hash tag for tweets search and number of tweets
#  tweets = twListToDF(tweets)    # Convert from list to dataframe
#  View(tweets)
#  tweets.df = tweets[,1]  # assign tweets for cleaning

#  tweets.df = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets.df);head(tweets.df) 
  
#  tweets.df = gsub("@\\w+", "", tweets.df);head(tweets.df) # regex for removing @user
#  tweets.df = gsub("[[:punct:]]", "", tweets.df);head(tweets.df) # regex for removing punctuation mark
#  tweets.df = gsub("[[:digit:]]", "", tweets.df);head(tweets.df) # regex for removing numbers
#  tweets.df = gsub("http\\w+", "", tweets.df);head(tweets.df) # regex for removing links
#  tweets.df = gsub("\n", " ", tweets.df);head(tweets.df)  ## regex for removing new line (\n)
#  tweets.df = gsub("[ \t]{2,}", " ", tweets.df);head(tweets.df) ## regex for removing two blank space
#  tweets.df =  gsub("[^[:alnum:]///' ]", " ", tweets.df)     # keep only alpha numeric 
#  tweets.df =  iconv(tweets.df, "latin1", "ASCII", sub="")   # Keep only ASCII characters
#  tweets.df = gsub("^\\s+|\\s+$", "", tweets.df);head(tweets.df)  # Remove leading and trailing white space
#  tweets[,1] = tweets.df # save in Data frame

#  head(tweets)

#  write.csv(tweets,paste0(gsub('#','',hashtag),'.csv'))

#}

The next code snippet will perform the following task:

  1. Make a unique tweets corpus (~ 2500) out of ~ 3000 tweets. Drop duplicates due to retweets etc.

  2. Randomly select 70% tweets (training data) and classify them manually as positive(1), neutral (0) or negative(-1).

  3. From this training data, build a simple classifier model (as we did in the simple classwork exercise). Split the sample into two-thirds (calibration) and one-third (holdout), then check the prediction accuracy of the model and build its confusion matrix.

## === 1. Text classification based on sentiments of text === ##

# Step 1- Read the training data set in R #

# Load the manually labelled tweets. file.choose() opens a file picker, so
# this step needs an interactive session.
data <- read.csv(file.choose(), stringsAsFactors = FALSE)
dim(data)
## [1] 2847    3

# Preview rows whose label is below 1 (labels: 1 positive, 0 neutral,
# -1 negative). which() turns the logical test into row indices, and `.`
# stands for those indices inside the subsetting call when using %>%.
which(data$classification < 1) %>%
  data[., ] %>%
  head()
## NOTE: the run recorded for this page used the misspelled column name
## `classfication`. That column does not exist, so no rows were selected and
## the output was:
## [1] id             text           classification
## <0 rows> (or 0-length row.names)
## With the corrected name this prints the first rows with a label below 1.
# Step 2- Split this data in two parts for evaluating models
set.seed(16102016)                              # fixed seed => reproducible split

n_docs  <- nrow(data)
samp_id <- sample(seq_len(n_docs),              # candidate row indices
                  size = round(n_docs * 0.70),  # 70% of the rows go to training
                  replace = FALSE)              # each row is drawn at most once

train <- data[samp_id, ]                        # the sampled 70%
test  <- data[-samp_id, ]                       # every row that was not sampled (30%)

dim(test)                                       # rows x columns of the hold-out part
dim(train)                                      # rows x columns of the training part
## [1] 854   3
## [1] 1993    3
# Step 3- Process the text data and create DTM (Document Term Matrix)
# Stack the training rows on top of the test rows so that a single DTM covers
# both; the first nrow(train) rows of train.data are the training documents.
train.data <- rbind(train, test)
train.data$text <- tolower(train.data$text)       # lower-case all tweets

# Clean the text: strip punctuation, then digits, then extra whitespace.
text <- train.data$text %>%
  removePunctuation() %>%
  removeNumbers() %>%
  stripWhitespace()

# TF-IDF weighting with normalisation switched off.
tfidf_raw <- function(x) weightTfIdf(x, normalize = FALSE)

cor <- Corpus(VectorSource(text))                 # text corpus built from the cleaned tweets
dtm <- DocumentTermMatrix(cor, control = list(weighting = tfidf_raw))

training_codes <- train.data$classification       # coded labels, in train.data row order
dim(dtm)
## [1] 2847 2916

The next code snippet will perform the following task:

Try changing the pre-processing a few times - dropping most common and uninformative words using the stopwords filter, for instance. Does it affect prediction accuracy?

# Step 4- Test the models and choose best model
# Bundle the DTM and the labels into an RTextTools container.
# Rows 1..nrow(train) of dtm are used for training; the remaining rows (the
# former `test` set) are the documents to be classified.
train_rows <- 1:nrow(train)
test_rows  <- (nrow(train) + 1):nrow(train.data)

container <- create_container(
  dtm,                  # document-term matrix covering train + test rows
  t(training_codes),    # labels, i.e. the outcome the model is trained on
  trainSize = train_rows,
  testSize  = test_rows,
  virgin    = FALSE     # FALSE: the test rows also carry labels (see RTextTools docs)
)
str(container)          # inspect the container's slots
## Formal class 'matrix_container' [package "RTextTools"] with 6 slots
##   ..@ training_matrix      :Formal class 'matrix.csr' [package "SparseM"] with 4 slots
##   .. .. ..@ ra       : num [1:21447] 4.09 6.43 7.31 3.09 7.57 ...
##   .. .. ..@ ja       : int [1:21447] 148 278 811 976 1138 1176 1529 1820 1857 2009 ...
##   .. .. ..@ ia       : int [1:1994] 1 18 27 39 50 58 72 83 88 102 ...
##   .. .. ..@ dimension: int [1:2] 1993 2916
##   ..@ classification_matrix:Formal class 'matrix.csr' [package "SparseM"] with 4 slots
##   .. .. ..@ ra       : num [1:9116] 9.89 4.26 9.89 7.77 9.89 ...
##   .. .. ..@ ja       : int [1:9116] 39 83 131 183 617 764 1110 1816 1820 1909 ...
##   .. .. ..@ ia       : int [1:855] 1 20 28 36 49 66 74 91 100 111 ...
##   .. .. ..@ dimension: int [1:2] 854 2916
##   ..@ training_codes       : Factor w/ 3 levels "-1","0","1": 1 3 2 3 1 3 1 2 1 1 ...
##   ..@ testing_codes        : Factor w/ 3 levels "-1","0","1": 3 2 1 1 3 3 1 1 2 3 ...
##   ..@ column_names         : chr [1:2916] "aadmi" "aaj" "aam" "aamir" ...
##   ..@ virgin               : logi FALSE
# Train a MAXENT model on the container. Other algorithm names that were
# listed next to this call: "SVM", "GLMNET", "SLDA", "TREE", "BAGGING",
# "BOOSTING", "RF".
models <- train_models(container, algorithms = "MAXENT")

# Classify the container's documents with the trained model.
results <- classify_models(container, models)

head(results)   # first rows: predicted label and its probability
##   MAXENTROPY_LABEL MAXENTROPY_PROB
## 1                1               1
## 2                0               1
## 3               -1               1
## 4               -1               1
## 5                1               1
## 6                1               1
names(train.data)
## [1] "id"             "text"           "classification"
# Pair every prediction with the manually coded label of the same test row,
# as input for the confusion matrix.
holdout_rows <- (nrow(train) + 1):nrow(train.data)   # rows of train.data used as the test set

out <- data.frame(
  model_sentiment  = results$MAXENTROPY_LABEL,              # label predicted by the model
  model_prob       = results$MAXENTROPY_PROB,               # probability reported for that label
  actual_sentiment = train.data$classification[holdout_rows] # actual (hand-coded) label
)

dim(out)
head(out)
## [1] 854   3
##   model_sentiment model_prob actual_sentiment
## 1               1          1                1
## 2               0          1                0
## 3              -1          1               -1
## 4              -1          1               -1
## 5               1          1                1
## 6               1          1                1
summary(out)           # per-class counts of predictions, numeric summaries of the other columns
##  model_sentiment   model_prob     actual_sentiment  
##  -1:338          Min.   :0.5001   Min.   :-1.00000  
##  0 :183          1st Qu.:1.0000   1st Qu.:-1.00000  
##  1 :333          Median :1.0000   Median : 0.00000  
##                  Mean   :0.9818   Mean   :-0.03279  
##                  3rd Qu.:1.0000   3rd Qu.: 1.00000  
##                  Max.   :1.0000   Max.   : 1.00000
# Confusion matrix: rows = predicted label, columns = actual label.
(z <- as.matrix(table(out[, 1], out[, 3])))
##     
##       -1   0   1
##   -1 329   4   5
##   0    5 175   3
##   1   16   3 314
# Accuracy = correctly classified documents / all test documents.
# Every diagonal cell is a correct prediction, so the whole diagonal is
# summed. Adding only z[1,1] + z[2,2] leaves out the third class and gives
# 59.02 for the matrix above instead of 95.78.
# The diagonal is only meaningful when rows and columns list the same labels
# in the same order, so that is checked first.
stopifnot(identical(rownames(z), colnames(z)))
(pct <- round((sum(diag(z)) / sum(z)) * 100, 2))      # prediction accuracy in % terms
## [1] 95.78