require("twitteR")||install.packages("twitteR")
require("base64enc")||install.packages("base64enc")
require("tm")||install.packages("rJava")
require("RTextTools")||install.packages("RTextTools")
require("magrittr")||install.packages("magrittr")
library("twitteR")
library("base64enc")
library("tm")
library("RTextTools")
library("magrittr")
#############################################
# Commenting out the code below as it requires a connection to Twitter and the associated personal API keys...
# Authentication
#############################################
#api_key <- "XXXXXXXXXXXXXXXXXX" #Consumer key: *
#api_secret <- "XXXXXXXXXXXXXXXXXX" # Consumer secret: *
#access_token <- "XXXXXXXXXXXXXXXXXX" # Access token:
#access_token_secret <- "XXXXXXXXXXXXXXXXXX" # Access token secret:
# When prompted after running the next command, type 1 to select 'Yes'
#setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
#############################################
# Extract Tweets
#############################################
#hashtags = c('#axisbank', '#flipkart', '#Paytm', '#narendramodi', '#rahulgandhi', '#demonetisation')
#for (hashtag in hashtags){
# tweets = searchTwitter(hashtag, n=10 ) # hash tag for tweets search and number of tweets
# tweets = twListToDF(tweets) # Convert from list to dataframe
# View(tweets)
# tweets.df = tweets[,1] # extract the tweet text (column 1) for cleaning
# tweets.df = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets.df);head(tweets.df) # remove retweet (RT/via) headers
# tweets.df = gsub("@\\w+", "", tweets.df);head(tweets.df) # regex for removing @user mentions
# tweets.df = gsub("[[:punct:]]", "", tweets.df);head(tweets.df) # regex for removing punctuation marks
# tweets.df = gsub("[[:digit:]]", "", tweets.df);head(tweets.df) # regex for removing numbers
# tweets.df = gsub("http\\w+", "", tweets.df);head(tweets.df) # regex for removing links
# tweets.df = gsub("\n", " ", tweets.df);head(tweets.df) # regex for removing new lines (\n)
# tweets.df = gsub("[ \t]{2,}", " ", tweets.df);head(tweets.df) # regex collapsing runs of spaces/tabs into a single space
# tweets.df = gsub("[^[:alnum:]///' ]", " ", tweets.df) # keep only alphanumeric characters
# tweets.df = iconv(tweets.df, "latin1", "ASCII", sub="") # keep only ASCII characters
# tweets.df = gsub("^\\s+|\\s+$", "", tweets.df);head(tweets.df) # remove leading and trailing whitespace
# tweets[,1] = tweets.df # save in Data frame
# head(tweets)
# write.csv(tweets,paste0(gsub('#','',hashtag),'.csv'))
#}
The next code snippet will perform the following tasks:
Make a corpus of unique tweets (~2,500) out of the ~3,000 collected, dropping duplicates caused by retweets etc. (an illustrative sketch of this step follows below).
Randomly select 70% of the tweets (training data) and classify them manually as positive (1), neutral (0) or negative (-1).
From this training data, build a simple classifier model (as we did in the simple classwork exercise). Split the sample into two-thirds (calibration) and one-third (holdout), check the prediction accuracy of the model, and build its confusion matrix.
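The sketch below shows one way the labelled corpus might be assembled. It assumes the per-hashtag CSV files written by the (commented) extraction loop above sit in the working directory; the file names and the tweets_to_label.csv output are illustrative, not part of the original run.
files = c('axisbank.csv', 'flipkart.csv', 'Paytm.csv', 'narendramodi.csv', 'rahulgandhi.csv', 'demonetisation.csv') # names follow paste0(gsub('#','',hashtag),'.csv') used in the loop above
all.tweets = do.call(rbind, lapply(files, read.csv, stringsAsFactors = FALSE)) # stack the per-hashtag files
all.tweets = all.tweets[!duplicated(tolower(all.tweets$text)), ] # drop retweet duplicates; ~2,500 unique tweets expected
all.tweets$id = seq_len(nrow(all.tweets)) # simple running id
all.tweets$classification = NA # to be filled in manually as 1 / 0 / -1
write.csv(all.tweets[, c('id', 'text', 'classification')], 'tweets_to_label.csv', row.names = FALSE)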
## === 1. Text classification based on sentiments of text === ##
# Step 1- Read the training data set in R #
data = read.csv(file.choose(), stringsAsFactors = F)
dim(data)
## [1] 2847 3
which(data$classification < 1) %>% # view some negative sentiment docs too; using pipe operator
data[.,] %>% # note use of '.' in subsetting ops when using %>%
head() # i.e., head rows of data in which sentiment col is < 1.
## [1] id text classification
## <0 rows> (or 0-length row.names)
# Step 2- Split this data in two parts for evaluating models
set.seed(16102016) # To fix the sample
samp_id = sample(1:nrow(data), # do ?sample to examine the sample() func
round(nrow(data)*.70), # 70% records will be used for training
replace = F) # sampling without replacement.
train = data[samp_id,] # 70% of training data set, examine struc of samp_id obj
test = data[-samp_id,] # remaining 30% of training data set
dim(test) ; dim(train) # dimns of test n training
## [1] 854 3
## [1] 1993 3
# Step 3- Process the text data and create DTM (Document Term Matrix)
train.data = rbind(train,test) # join the data sets
train.data$text = tolower(train.data$text) # Convert to lower case
text = train.data$text
text = removePunctuation(text) # remove punctuation marks
text = removeNumbers(text) # remove numbers
text = stripWhitespace(text) # remove blank space
cor = Corpus(VectorSource(text)) # Create text corpus
dtm = DocumentTermMatrix(cor, # Create the DTM
control = list(weighting =
function(x)
weightTfIdf(x, normalize = F))) # TF-IDF weighting
training_codes = train.data$classification # Coded labels
dim(dtm)
## [1] 2847 2916
The next task is the following:
Try changing the pre-processing a few times, for instance by dropping the most common and uninformative words with a stopwords filter. Does it affect prediction accuracy? An illustrative sketch of this variation follows.
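A minimal sketch of one such variation, reusing the 'text' vector cleaned in Step 3 above: remove English stopwords with tm's filter, rebuild the DTM, and then rerun Steps 4-5 on the new matrix to compare accuracy.
text.nostop = removeWords(text, stopwords("english")) # drop common uninformative words
text.nostop = stripWhitespace(text.nostop) # clean up the extra spaces left behind
cor2 = Corpus(VectorSource(text.nostop))
dtm2 = DocumentTermMatrix(cor2, control = list(weighting = function(x) weightTfIdf(x, normalize = F))) # same TF-IDF weighting as before
dim(dtm2) # typically fewer terms than the 2916 columns seen above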
# Step 4- Test the models and choose the best model
container <- create_container(dtm, # creates a 'container' obj for training, classifying, and analyzing docs
t(training_codes), # labels or the Y variable / outcome we want to train on
trainSize = 1:nrow(train),
testSize = (nrow(train)+1):nrow(train.data),
virgin = FALSE) # whether to treat the classification data as 'virgin' data or not.
# if virgin = TRUE, then the machine won't borrow from prior datasets.
str(container) # view struc of the container obj; is a list of training n test data
## Formal class 'matrix_container' [package "RTextTools"] with 6 slots
## ..@ training_matrix :Formal class 'matrix.csr' [package "SparseM"] with 4 slots
## .. .. ..@ ra : num [1:21447] 4.09 6.43 7.31 3.09 7.57 ...
## .. .. ..@ ja : int [1:21447] 148 278 811 976 1138 1176 1529 1820 1857 2009 ...
## .. .. ..@ ia : int [1:1994] 1 18 27 39 50 58 72 83 88 102 ...
## .. .. ..@ dimension: int [1:2] 1993 2916
## ..@ classification_matrix:Formal class 'matrix.csr' [package "SparseM"] with 4 slots
## .. .. ..@ ra : num [1:9116] 9.89 4.26 9.89 7.77 9.89 ...
## .. .. ..@ ja : int [1:9116] 39 83 131 183 617 764 1110 1816 1820 1909 ...
## .. .. ..@ ia : int [1:855] 1 20 28 36 49 66 74 91 100 111 ...
## .. .. ..@ dimension: int [1:2] 854 2916
## ..@ training_codes : Factor w/ 3 levels "-1","0","1": 1 3 2 3 1 3 1 2 1 1 ...
## ..@ testing_codes : Factor w/ 3 levels "-1","0","1": 3 2 1 1 3 3 1 1 2 3 ...
## ..@ column_names : chr [1:2916] "aadmi" "aaj" "aam" "aamir" ...
## ..@ virgin : logi FALSE
models <- train_models(container, # ?train_models; makes a model object using the specified algorithms.
algorithms=c("MAXENT")) #"MAXENT","SVM","GLMNET","SLDA","TREE","BAGGING","BOOSTING","RF"
results <- classify_models(container, models)
head(results)
## MAXENTROPY_LABEL MAXENTROPY_PROB
## 1 1 1
## 2 0 1
## 3 -1 1
## 4 -1 1
## 5 1 1
## 6 1 1
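The comment above lists the other algorithms RTextTools supports. As an illustrative sketch (not part of the original run), several of them can be trained on the same container and compared side by side using the package's train_models, classify_models and create_analytics helpers; the exact columns in the combined results depend on the algorithms chosen.
models2 <- train_models(container, algorithms = c("MAXENT","SVM")) # train two algorithms on the same container
results2 <- classify_models(container, models2) # one label/probability pair per algorithm
analytics <- create_analytics(container, results2) # per-label and per-document summaries
summary(analytics) # precision, recall and F-scores by label and algorithm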
names(train.data)
## [1] "id" "text" "classification"
# building a confusion matrix to see accuracy of prediction results
out = data.frame(model_sentiment = results$MAXENTROPY_LABEL, # the model's predicted label of Y
model_prob = results$MAXENTROPY_PROB,
actual_sentiment = train.data$classification[(nrow(train)+1):nrow(train.data)]) # actual value of Y
dim(out); head(out);
## [1] 854 3
## model_sentiment model_prob actual_sentiment
## 1 1 1 1
## 2 0 1 0
## 3 -1 1 -1
## 4 -1 1 -1
## 5 1 1 1
## 6 1 1 1
summary(out) # how many -1s, 0s and 1s were there anyway?
## model_sentiment model_prob actual_sentiment
## -1:338 Min. :0.5001 Min. :-1.00000
## 0 :183 1st Qu.:1.0000 1st Qu.:-1.00000
## 1 :333 Median :1.0000 Median : 0.00000
## Mean :0.9818 Mean :-0.03279
## 3rd Qu.:1.0000 3rd Qu.: 1.00000
## Max. :1.0000 Max. : 1.00000
(z = as.matrix(table(out[,1], out[,3]))) # display the confusion matrix.
##
## -1 0 1
## -1 329 4 5
## 0 5 175 3
## 1 16 3 314
(pct = round((sum(diag(z))/sum(z))*100, 2)) # prediction accuracy in % terms: whole diagonal over all holdout tweets
## [1] 95.78
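Since z is a 3 x 3 table with predicted labels in rows and actual labels in columns, per-class recall and precision can be read off the same matrix; a small sketch using the z computed above:
diag(z) / colSums(z) # recall per class: correct predictions over actual counts
diag(z) / rowSums(z) # precision per class: correct predictions over predicted counts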
http://rpubs.com/neerajkhattar/Classifytweets
By adding more and more records, we found that the machine becomes more accurate; this is how it learns from the data. With a small number of records we were not getting even 30% accuracy; after increasing the number of labelled tweets, the model now classifies about 95.8% of the holdout tweets correctly (818 of 854).
We picked the following six hashtags, which are among today's buzzwords: #axisbank, #flipkart, #Paytm, #narendramodi, #rahulgandhi and #demonetisation.
Our document-term matrix contains 2847 documents and 2916 terms and, as computed above, the model reaches about 95.78% accuracy on the 854 holdout tweets. Our confusion matrix is as follows:
                actual
 predicted    -1    0    1
        -1   329    4    5
         0     5  175    3
         1    16    3  314
Learning: the manual classification of the training tweets must be done thoroughly, otherwise the machine will not learn and will not predict well; in that case accuracy can fall below 50%. Dropping the uninformative words (stopwords) also pushes up the prediction accuracy.
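As an illustrative check of that last point (not in the original analysis), the Steps 3-5 pipeline above can be wrapped in a small helper so that accuracy with and without the stopwords filter is compared on the same train/test split; everything below reuses objects already defined above (text, train, train.data, training_codes), and the helper name is hypothetical.
holdout_accuracy = function(txt, drop.stopwords = FALSE) { # hypothetical helper, not in the original script
if (drop.stopwords) txt = removeWords(txt, stopwords("english")) # optional stopword filter
txt = stripWhitespace(txt)
d = DocumentTermMatrix(Corpus(VectorSource(txt)), control = list(weighting = function(x) weightTfIdf(x, normalize = F)))
cont = create_container(d, t(training_codes), trainSize = 1:nrow(train), testSize = (nrow(train)+1):nrow(train.data), virgin = FALSE)
res = classify_models(cont, train_models(cont, algorithms = "MAXENT"))
actual = train.data$classification[(nrow(train)+1):nrow(train.data)]
mean(as.character(res$MAXENTROPY_LABEL) == as.character(actual)) # holdout accuracy
}
holdout_accuracy(text) # baseline pre-processing
holdout_accuracy(text, drop.stopwords = TRUE) # with the stopwords filter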