The dataset is sourced from Kaggle, and the main focus of this project is to learn various NLP techniques. By classifying each review, a user can gauge how a movie is trending among the public, which helps them make strategic decisions in their personal or business life.
These points suggest that by classifying movie reviews, we can give a machine learning model the extra information it needs to recommend movies to users.
pacman::p_load(e1071, ggplot2, caret, corrplot, dplyr, tm, wordcloud, RColorBrewer, tm.plugin.webmining, SnowballC, ngram, stringr, lsa, koRpus, textdata,
               gutenbergr, tidytext, textstem, widyr, igraph, ggraph, forcats, tidyr, janitor, lubridate, reshape2,
               tidyverse, foreign, nnet, neuralnet, arm, ROCR, pROC, party, rpart, doParallel, xgboost, imager, magrittr, rattle,
               plotly, adabag)
theme_set(theme_classic())
options(digits = 3)
In the dataset, each sentence is divided into phrases of varying length; some phrases contain only a single word. From the statistics below, we can infer the following:
## [1] "Average count of phrases per sentence in the train dataset is: 18"
## [1] "Average count of phrases per sentence in the test dataset is: 20"
## [1] "Number of phrases in the train dataset: 156060"
## [1] "Number of phrases in the test dataset: 66292"
## [1] "Number of sentences in the train dataset: 8529"
## [1] "Number of sentences in the test dataset: 3310"
## [1] "Average word length of phrases in the train dataset is: 7"
## [1] "Average word length of phrases in the test dataset is: 7"
Description of the target variable:
Below we can see the distribution of sentiment (the target variable) within the training dataset. Most phrases received a sentiment value of 2 (average), far fewer received the other ratings, and 0 is the least frequent.
sentimentValue <- aggregate(train$PhraseId,by=list(train$Sentiment), FUN=length)$Group.1
sentimentcount <- aggregate(train$PhraseId,by=list(train$Sentiment), FUN=length)$x
barplot(sentimentcount, las = 2,
names.arg = sentimentValue,
col ="330068", main ="Count of Movie Ratings in Training Data Set",
ylab = "Word frequencies")We are making Corpus (dictionary of words) from all the phrases, then removing unnecessary text like stopwords, numbers,pronunciation, whitespaces which does not provides any meaning. Also, we have performed lemmenting to derive meaningful root of the words.
train.corpus <- VCorpus(VectorSource(train$Phrase))
train.corpus = tm_map(train.corpus, content_transformer(tolower))
train.corpus = tm_map(train.corpus, removeNumbers)
train.corpus = tm_map(train.corpus, removePunctuation)
train.corpus = tm_map(train.corpus, removeWords, stopwords())
train.corpus = tm_map(train.corpus, stripWhitespace)
train.corpusw = tm_map(train.corpus, lemmatize_strings)
train.corpusw = tm_map(train.corpusw, PlainTextDocument)
Now we have clean text from the phrases, which needs to be converted into numeric vectors using word counts.
Creating Dataframe for WordCloud
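The construction of the frequency dataframe d used by the word cloud and the barplots below is not shown; a minimal sketch, assuming it is derived from a TermDocumentMatrix of the lemmatized corpus:
tdm <- TermDocumentMatrix(train.corpusw)
word_freq <- sort(slam::row_sums(tdm), decreasing = TRUE)  # sparse row sums avoid a huge dense matrix
d <- data.frame(word = names(word_freq), freq = word_freq)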
The word cloud shows that film, movie, like, one, character, make, story, good, time, not, see, comedy, plot, work, and funny are used most often in the movie reviews. The word 'not' suggests that we should build bigrams before performing any sentiment analysis.
pal2 <- brewer.pal(8,"Dark2")
suppressWarnings(wordcloud(d$word, colors=pal2,random.order=FALSE, d$freq, min.freq=80,scale=c(8,.2),max.words=47,rot.per=.15)
)
Let's look at the most frequent words to understand which words people use in their movie reviews and which are likely to be influential in the sentiment analysis.
barplot(d[1:10,]$freq, las = 2,
names.arg = d[1:10,]$word,
col ="330066", main ="Most frequent words",
ylab = "Word frequencies")Let’s look at least words to understand which words are used by people to provide movie reviews and which might not be influential in the sentiment analysis. To be sure, we need to perform TF-IDF which will provide weights to each word which might make less occurring words important for analysis
barplot(d[nrow(d):(nrow(d)-10),]$freq, las = 2,
names.arg = d[nrow(d):(nrow(d)-10),]$word,
col ="green", main ="Least frequent words",
ylab = "Word frequencies")Removing NA’s and converting target variable to factors
dataset$Sentiment = train$Sentiment
dataset$Sentiment = factor(dataset$Sentiment, levels = c(0,1,2,3,4))
df <- data.frame(text=unlist(sapply(train.corpus, `[`, "content")), stringsAsFactors=FALSE)
df$Sentiment = train$Sentiment
##df$Sentiment = factor(df$Sentiment, levels = c(0,1,2,3,4))
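# The loop below replaces empty phrases with NA and lemmatizes the non-empty ones,
# collecting the results in 'val' (the initial dummy 0 is dropped after the loop).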
val=0
for (i in 1:nrow(df))
{
if (df$text[i]== "")
{
val=c(val,gsub("",NA,df$text[i]))
}else{
val=c(val,lemmatize_strings(df$text[i]))
}
}
val <- val[2:length(val)]
df$text <- val
df <- df[!duplicated(df), ]
df <- na.omit(df)
df <- df[!apply(is.na(df) | df$text == "", 1, all),]Tokenizing the Words
The graph below suggests that the token "s" appears many times. "s" is not a word and needs to be eliminated from the dataframe (see the sketch after the plot).
tidy_book %>%
count(word, sort = TRUE) %>%
top_n(20) %>%
ggplot(aes(fct_reorder(word, n), n)) +
geom_col(fill="gold", width = 0.6) +
coord_flip() +
labs(y="Frequency", x = NULL)## Selecting by n
Sentiment lexicons
After exploring sentiment lexicons using bing, we find 47044 positive and 47753 negative word occurrences in the training phrases.
## Joining, by = "word"
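The counting step behind these totals is not shown; a minimal sketch using the bing lexicon:
tidy_book %>%
  inner_join(get_sentiments("bing")) %>%
  count(sentiment)   # totals of positive vs negative word occurrences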
Below are the positive and negative words in the training dataset that contribute the most to the sentiment scores.
tidy_book %>%
inner_join(get_sentiments("bing")) %>%
count(sentiment, word, sort = TRUE) %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup %>%
ggplot(aes(fct_reorder(word, n),
n,
fill = sentiment)) +
geom_col() +
coord_flip() +
facet_wrap(~ sentiment, scales = "free") +
labs(y= "Contribution to Sentiment", x = NULL)## Joining, by = "word"
## Selecting by n
Implementing Bigrams
Below are some of the most common bigrams.
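The bigram tokenization that produces tidy_ngram is not shown; a minimal sketch, mirroring the pattern used later for tidy_ngram1:
tidy_ngram <- df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)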
bigram_counts <- tidy_ngram %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word) %>%
count(word1, word2, sort = TRUE)
bigram_counts
Creating a word network from bigrams
## IGRAPH de3e901 DN-- 64 43 --
## + attr: name (v/c), n (e/n)
## + edges from de3e901 (vertex names):
## [1] romantic ->comedy lrb ->rrb love ->story
## [4] spin ->dry subject ->matter special ->effect
## [7] soap ->opus run ->time bad ->movie
## [10] action ->film rrb ->lrb de ->niro
## [13] sense ->humor action ->sequence horror ->movie
## [16] world ->war action ->movie horror ->film
## [19] target ->audience war ->ii war ->movie
## [22] character->study motion ->picture adam ->sandler
## + ... omitted several edges
After visualizing the bigram network, we can see which pairs of words occur together most often. The words below co-occur more than 40 times across all phrases in the dataset. The network suggests that bigrams are well suited for sentiment analysis.
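The construction of bigram_graph is not shown; a minimal sketch that keeps only bigrams occurring more than 40 times, as described above:
bigram_graph <- bigram_counts %>%
  filter(n > 40) %>%
  graph_from_data_frame()   # igraph object consumed by ggraph below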
bigram_graph %>%
ggraph(layout = "nicely") +
geom_node_point(size = 6, color = "khaki") +
geom_edge_link(aes(edge_alpha = n),
show.legend = FALSE,
arrow = arrow(length = unit(1.5, 'mm')),
start_cap = circle(3, 'mm'),
end_cap = circle(3, 'mm')) +
geom_node_text(aes(label = name),
color = "navy") +
theme_graph()
We performed a latent semantic analysis (LSA) transformation to convert the bigram TF-IDF features into 50 transformed variables.
df1 <- df[1:1000,]
tidy_ngram1 <- df1 %>%
unnest_tokens(bigram, text, token = "ngrams", n=2)
train.corpus_ngram <- VCorpus(VectorSource(tidy_ngram1$bigram))
dtm_ngram = DocumentTermMatrix(train.corpus_ngram)
tfidf <- weightTfIdf(dtm_ngram)
txt_mat<- as.textmatrix(as.matrix(tfidf))
lsa_model <- lsa(txt_mat,dim=50)
words.df <- as.data.frame(as.matrix(lsa_model$tk))
tidy_ngram1$Sentiment = as.factor(tidy_ngram1$Sentiment)
trainData <- cbind(label = tidy_ngram1$Sentiment, words.df)The accuacy for Ljnear SVM is 50.3%.
set.seed(13)
pet_CV_Folds <- createMultiFolds(trainData$label, k = 5, times=1)
cl <- parallel::makeCluster(detectCores(logical=TRUE)-1, type='PSOCK')
doParallel::registerDoParallel(cl)
trnControl <- trainControl(method='cv',index=pet_CV_Folds,
allowParallel = TRUE,verboseIter=TRUE)
grid <- expand.grid(C=seq(0.8,1.2,0.2))
set.seed(13)
Linear_SVM <- caret::train(label ~., data = trainData,method="svmLinear",
trControl=trnControl,tuneGrid = grid)
## Aggregating results
## Selecting tuning parameters
## Fitting C = 1.2 on full training set
Linear_SVM_pred <- predict(Linear_SVM, trainData)
confusionMatrix(table(Linear_SVM_pred,trainData$label))
## Confusion Matrix and Statistics
##
##
## Linear_SVM_pred 0 1 2 3 4
## 0 0 0 0 0 0
## 1 30 146 88 23 4
## 2 90 499 1389 572 194
## 3 2 12 19 43 22
## 4 0 8 15 6 27
##
## Overall Statistics
##
## Accuracy : 0.503
## 95% CI : (0.486, 0.521)
## No Information Rate : 0.474
## P-Value [Acc > NIR] : 0.00046
##
## Kappa : 0.122
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.0000 0.2195 0.919 0.0668 0.10931
## Specificity 1.0000 0.9426 0.192 0.9784 0.99014
## Pos Pred Value NaN 0.5017 0.506 0.4388 0.48214
## Neg Pred Value 0.9617 0.8209 0.726 0.8056 0.92978
## Prevalence 0.0383 0.2085 0.474 0.2019 0.07745
## Detection Rate 0.0000 0.0458 0.436 0.0135 0.00847
## Detection Prevalence 0.0000 0.0913 0.860 0.0307 0.01756
## Balanced Accuracy 0.5000 0.5811 0.556 0.5226 0.54973
The accuracy for the radial SVM is 49.5%.
parallel::stopCluster(cl)
registerDoSEQ()
cl <- parallel::makeCluster(detectCores(logical=TRUE)-1, type='PSOCK')
doParallel::registerDoParallel(cl)
trnControl <- trainControl(method='cv',index=pet_CV_Folds,
allowParallel = TRUE,verboseIter=TRUE)
grid <- expand.grid(C=1,sigma = 0.01)
set.seed(13)
Radial_SVM <- caret::train(label ~., data = trainData,method="svmRadial",
trControl=trnControl,tuneGrid = grid)
## Aggregating results
## Fitting final model on full training set
Radial_SVM_pred <- predict(Radial_SVM, trainData)
confusionMatrix(table(Radial_SVM_pred,trainData$label))
## Confusion Matrix and Statistics
##
##
## Radial_SVM_pred 0 1 2 3 4
## 0 0 0 0 0 0
## 1 39 178 95 30 4
## 2 82 479 1407 574 207
## 3 1 8 7 40 16
## 4 0 0 2 0 20
##
## Overall Statistics
##
## Accuracy : 0.516
## 95% CI : (0.498, 0.533)
## No Information Rate : 0.474
## P-Value [Acc > NIR] : 1.12e-06
##
## Kappa : 0.141
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.0000 0.2677 0.931 0.0621 0.08097
## Specificity 1.0000 0.9334 0.200 0.9874 0.99932
## Pos Pred Value NaN 0.5145 0.512 0.5556 0.90909
## Neg Pred Value 0.9617 0.8287 0.764 0.8062 0.92832
## Prevalence 0.0383 0.2085 0.474 0.2019 0.07745
## Detection Rate 0.0000 0.0558 0.441 0.0125 0.00627
## Detection Prevalence 0.0000 0.1085 0.862 0.0226 0.00690
## Balanced Accuracy 0.5000 0.6006 0.566 0.5248 0.54015
The accuracy for the decision tree is 49.4%.
parallel::stopCluster(cl)
registerDoSEQ()
cl <- parallel::makeCluster(detectCores(logical=FALSE), type='PSOCK')
doParallel::registerDoParallel(cl)
tune.gridcart <- expand.grid(maxdepth = seq(1,10,1))
trnControl <- trainControl(method='cv',index=pet_CV_Folds,
allowParallel = TRUE,verboseIter=TRUE)
set.seed(13)
tree <- caret::train(label ~., data = trainData, method = "rpart",
parms = list(split = "information"),
trControl=trnControl)
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.00511 on full training set
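The prediction call is not shown in this chunk; presumably something like the following produced the confusion matrix below:
tree_pred <- predict(tree, trainData)
confusionMatrix(table(tree_pred, trainData$label))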
## Confusion Matrix and Statistics
##
##
## tree_pred 0 1 2 3 4
## 0 0 0 0 0 0
## 1 53 105 86 23 0
## 2 69 559 1423 589 216
## 3 0 0 2 30 14
## 4 0 1 0 2 17
##
## Overall Statistics
##
## Accuracy : 0.494
## 95% CI : (0.476, 0.511)
## No Information Rate : 0.474
## P-Value [Acc > NIR] : 0.0122
##
## Kappa : 0.088
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.0000 0.1579 0.942 0.04658 0.06883
## Specificity 1.0000 0.9358 0.146 0.99371 0.99898
## Pos Pred Value NaN 0.3933 0.498 0.65217 0.85000
## Neg Pred Value 0.9617 0.8084 0.736 0.80465 0.92742
## Prevalence 0.0383 0.2085 0.474 0.20194 0.07745
## Detection Rate 0.0000 0.0329 0.446 0.00941 0.00533
## Detection Prevalence 0.0000 0.0837 0.896 0.01442 0.00627
## Balanced Accuracy 0.5000 0.5469 0.544 0.52015 0.53390
Since we are using LSA-transformed variables, we cannot deduce which individual words are most important to the decision tree.
The accuracy for the XGBoost decision tree is 58.4%. With XGBoost, the decision tree's accuracy improves noticeably, but the model is still not good at classifying reviews.
parallel::stopCluster(cl)
registerDoSEQ()
cl <- parallel::makeCluster(detectCores(logical=FALSE), type='PSOCK')
doParallel::registerDoParallel(cl)
tune.gridcart <- expand.grid(maxdepth = seq(1,10,1))
trnControl <- trainControl(method='cv',index=pet_CV_Folds,
allowParallel = TRUE,verboseIter=TRUE)
set.seed(13)
XGboost_DT <- caret::train(label ~., data = trainData, method = "xgbTree",
parms = list(split = "information"),
trControl=trnControl)
## Aggregating results
## Selecting tuning parameters
## Fitting nrounds = 50, max_depth = 2, eta = 0.3, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1, subsample = 0.75 on full training set
XGboost_DT_pred <- predict(XGboost_DT, trainData)
confusionMatrix(table(XGboost_DT_pred,trainData$label))
## Confusion Matrix and Statistics
##
##
## XGboost_DT_pred 0 1 2 3 4
## 0 15 6 7 1 0
## 1 54 291 136 38 8
## 2 52 349 1277 392 88
## 3 1 13 61 180 53
## 4 0 6 30 33 98
##
## Overall Statistics
##
## Accuracy : 0.584
## 95% CI : (0.566, 0.601)
## No Information Rate : 0.474
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.329
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.12295 0.4376 0.845 0.2795 0.3968
## Specificity 0.99544 0.9065 0.475 0.9497 0.9765
## Pos Pred Value 0.51724 0.5522 0.592 0.5844 0.5868
## Neg Pred Value 0.96614 0.8595 0.773 0.8389 0.9507
## Prevalence 0.03826 0.2085 0.474 0.2019 0.0775
## Detection Rate 0.00470 0.0913 0.400 0.0564 0.0307
## Detection Prevalence 0.00909 0.1653 0.677 0.0966 0.0524
## Balanced Accuracy 0.55919 0.6720 0.660 0.6146 0.6867
The accuracy for the random forest is 76.5%; it classifies the reviews much better than the previous models.
parallel::stopCluster(cl)
registerDoSEQ()
cl <- parallel::makeCluster(detectCores(logical=FALSE), type='PSOCK')
doParallel::registerDoParallel(cl)
tune.gridcart <- expand.grid(maxdepth = seq(1,10,1))
trnControl <- trainControl(method='cv',index=pet_CV_Folds,
allowParallel = TRUE,verboseIter=TRUE)
set.seed(13)
rf_tree <- caret::train(label ~., data = trainData, method = "rf",
parms = list(split = "information"),
trControl=trnControl)
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 26 on full training set
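As above, the prediction step is not shown; presumably:
rf_tree_pred <- predict(rf_tree, trainData)
confusionMatrix(table(rf_tree_pred, trainData$label))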
## Confusion Matrix and Statistics
##
##
## rf_tree_pred 0 1 2 3 4
## 0 122 0 0 0 0
## 1 0 557 120 11 1
## 2 0 97 1322 302 58
## 3 0 9 56 307 58
## 4 0 2 13 24 130
##
## Overall Statistics
##
## Accuracy : 0.765
## 95% CI : (0.749, 0.779)
## No Information Rate : 0.474
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.642
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 1.0000 0.838 0.875 0.4767 0.5263
## Specificity 1.0000 0.948 0.728 0.9517 0.9867
## Pos Pred Value 1.0000 0.808 0.743 0.7140 0.7692
## Neg Pred Value 1.0000 0.957 0.866 0.8779 0.9613
## Prevalence 0.0383 0.209 0.474 0.2019 0.0775
## Detection Rate 0.0383 0.175 0.415 0.0963 0.0408
## Detection Prevalence 0.0383 0.216 0.558 0.1348 0.0530
## Balanced Accuracy 1.0000 0.893 0.801 0.7142 0.7565
The accuracy for the AdaBoost decision tree is 49.6%. With AdaBoost, the decision tree's accuracy is almost unchanged.
parallel::stopCluster(cl)
registerDoSEQ()
cl <- parallel::makeCluster(detectCores(logical=FALSE), type='PSOCK')
doParallel::registerDoParallel(cl)
tune.gridcart <- expand.grid(maxdepth = seq(1,10,1))
trnControl <- trainControl(method='cv',index=pet_CV_Folds,
allowParallel = TRUE,verboseIter=TRUE)
set.seed(13)
Adaboost_DT <- caret::train(label ~., data = trainData, method = "AdaBag",
parms = list(split = "information"),
trControl=trnControl)
## Aggregating results
## Selecting tuning parameters
## Fitting mfinal = 50, maxdepth = 3 on full training set
Adaboost_DT_pred <- predict(Adaboost_DT, trainData)
confusionMatrix(table(Adaboost_DT_pred,trainData$label))
## Confusion Matrix and Statistics
##
##
## Adaboost_DT_pred 0 1 2 3 4
## 0 0 0 0 0 0
## 1 45 179 107 28 6
## 2 77 486 1404 616 241
## 3 0 0 0 0 0
## 4 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.496
## 95% CI : (0.479, 0.514)
## No Information Rate : 0.474
## P-Value [Acc > NIR] : 0.00563
##
## Kappa : 0.095
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.0000 0.2692 0.929 0.000 0.0000
## Specificity 1.0000 0.9263 0.154 1.000 1.0000
## Pos Pred Value NaN 0.4904 0.497 NaN NaN
## Neg Pred Value 0.9617 0.8279 0.707 0.798 0.9225
## Prevalence 0.0383 0.2085 0.474 0.202 0.0775
## Detection Rate 0.0000 0.0561 0.440 0.000 0.0000
## Detection Prevalence 0.0000 0.1145 0.886 0.000 0.0000
## Balanced Accuracy 0.5000 0.5977 0.541 0.500 0.5000
The accuracy for the bagged (bootstrap-aggregated) decision tree model is 75.8%. After bagging, the decision tree's classification accuracy improves substantially.
parallel::stopCluster(cl)
registerDoSEQ()
cl <- parallel::makeCluster(detectCores(logical=FALSE), type='PSOCK')
doParallel::registerDoParallel(cl)
tune.gridcart <- expand.grid(maxdepth = seq(1,10,1))
trnControl <- trainControl(method='cv',index=pet_CV_Folds,
allowParallel = TRUE,verboseIter=TRUE)
set.seed(13)
dtree_reg <- caret::train(label ~., data = trainData, method = "treebag",
parms = list(split = "information"),trControl=trnControl)## Aggregating results
## Fitting final model on full training set
dtree_reg_pred <- predict(dtree_reg, trainData)
confusionMatrix(table(dtree_reg_pred,trainData$label))
## Confusion Matrix and Statistics
##
##
## dtree_reg_pred 0 1 2 3 4
## 0 115 0 0 0 0
## 1 3 533 105 11 0
## 2 4 117 1307 276 55
## 3 0 13 83 336 66
## 4 0 2 16 21 126
##
## Overall Statistics
##
## Accuracy : 0.758
## 95% CI : (0.743, 0.773)
## No Information Rate : 0.474
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.633
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.9426 0.802 0.865 0.522 0.5101
## Specificity 1.0000 0.953 0.731 0.936 0.9867
## Pos Pred Value 1.0000 0.817 0.743 0.675 0.7636
## Neg Pred Value 0.9977 0.948 0.857 0.886 0.9600
## Prevalence 0.0383 0.209 0.474 0.202 0.0775
## Detection Rate 0.0361 0.167 0.410 0.105 0.0395
## Detection Prevalence 0.0361 0.204 0.552 0.156 0.0517
## Balanced Accuracy 0.9713 0.877 0.798 0.729 0.7484
The accuracy for the neural network is 50.6%.
parallel::stopCluster(cl)
registerDoSEQ()
cl <- parallel::makeCluster(detectCores(logical=FALSE), type='PSOCK')
doParallel::registerDoParallel(cl)
tune.gridcart <- expand.grid(maxdepth = seq(1,10,1))
trnControl <- trainControl(method='cv',index=pet_CV_Folds,
allowParallel = TRUE,verboseIter=TRUE)
set.seed(13)
Nnet_reg <- caret::train(label ~., data = trainData, method = "nnet",trControl=trnControl)## Aggregating results
## Selecting tuning parameters
## Fitting size = 3, decay = 0 on full training set
## # weights: 173
## initial value 5867.332857
## iter 10 value 4240.305889
## iter 20 value 4194.626796
## iter 30 value 3973.247208
## iter 40 value 3771.903294
## iter 50 value 3629.095470
## iter 60 value 3584.016068
## iter 70 value 3560.778177
## iter 80 value 3549.414945
## iter 90 value 3543.091799
## iter 100 value 3535.772916
## final value 3535.772916
## stopped after 100 iterations
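Again, the prediction step is not shown; presumably:
Nnet_reg_pred <- predict(Nnet_reg, trainData)
confusionMatrix(table(Nnet_reg_pred, trainData$label))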
## Confusion Matrix and Statistics
##
##
## Nnet_reg_pred 0 1 2 3 4
## 0 9 8 10 3 0
## 1 30 127 90 10 0
## 2 78 515 1358 544 173
## 3 5 9 37 68 23
## 4 0 6 16 19 51
##
## Overall Statistics
##
## Accuracy : 0.506
## 95% CI : (0.488, 0.523)
## No Information Rate : 0.474
## P-Value [Acc > NIR] : 0.000161
##
## Kappa : 0.141
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4
## Sensitivity 0.07377 0.1910 0.899 0.1056 0.2065
## Specificity 0.99315 0.9485 0.219 0.9709 0.9861
## Pos Pred Value 0.30000 0.4942 0.509 0.4789 0.5543
## Neg Pred Value 0.96423 0.8165 0.706 0.8110 0.9367
## Prevalence 0.03826 0.2085 0.474 0.2019 0.0775
## Detection Rate 0.00282 0.0398 0.426 0.0213 0.0160
## Detection Prevalence 0.00941 0.0806 0.837 0.0445 0.0288
## Balanced Accuracy 0.53346 0.5697 0.559 0.5383 0.5963
The area under the curve (AUC) for the random forest is the highest; it outperforms all the other algorithms.
dtree_roc_obj <- roc(trainData$label, as.numeric(dtree_reg_pred))
Linear_SVM_roc_obj <- roc(trainData$label, as.numeric(Linear_SVM_pred))
Radial_SVM_roc_obj_roc_obj <- roc(trainData$label, as.numeric(Radial_SVM_pred))
XGboost_roc_obj <- roc(trainData$label, as.numeric(XGboost_DT_pred))
Nnet_roc_obj <- roc(trainData$label, as.numeric(Nnet_reg_pred))
Rf_roc_obj <- roc(trainData$label, as.numeric(rf_tree_pred))
tree_roc_obj <- roc(trainData$label, as.numeric(tree_pred))
Adaboost_roc_obj <- roc(trainData$label, as.numeric(Adaboost_DT_pred))
plot(dtree_roc_obj, print.auc=TRUE,col="red",main="Decision Tree")
Each colour represents a fold. The lines show what accuracy each model achieves when the same fold is fed into it.
Extreme Gradient Boosting (xgbTree) gives significantly better accuracy for every fold.
movie_comp <- resamples(list(svmLinear = Linear_SVM,
svmRadial = Radial_SVM,
BoostedDecisionTree = dtree_reg,
XGBoostedTree = XGboost_DT,
NeuralNetwork = Nnet_reg,
DecisionTree = tree,
AdaboostTree = Adaboost_DT,
RandomForest = rf_tree))
bwplot(movie_comp, metric = "Accuracy",main='Models vs Accuracy Boxplot')
Time Comparison
Each colour represents a model. The best model is a balance between accuracy and time taken, i.e. one that takes less time and also gives good accuracy.
Extreme Gradient Boosting (xgbTree) gives significantly better accuracy and takes only a few seconds to produce a result.
xyplot(movie_comp, what = "mTime",units = "min",
main='Movie Reviews : ModelTime Plot for all models(k=5 folds)',
auto.key=list(space='left', row=1,
title='Model', cex.title=1.5,
lines=TRUE, points=FALSE))
This is the learning curve of training and cross-validation accuracy versus the number of training examples. The plot suggests that our model is too simple and needs more features to perform better, because the training accuracy is being pulled down while the cross-validation accuracy is not increasing. We need to perform further analysis before running more models.
trainData_curve <- as.data.frame(lapply(trainData, as.numeric))
trainData_curve$label <- as.factor(trainData_curve$label)
cl <- parallel::makeCluster(detectCores(logical=FALSE), type='PSOCK')
doParallel::registerDoParallel(cl)
trnControl <- trainControl(method='cv',index=pet_CV_Folds, allowParallel = TRUE)
besttune_model = expand.grid(.mtry=c(1,5,10,15))
set.seed(13)
learning_curve <- learning_curve_dat(
dat = trainData_curve,
outcome = "label",test_prop = 0,
verbose = TRUE, method = "rf",
metric = "metric",tuneGrid = besttune_model)## Training for 10% (n = 318)
## Training for 20% (n = 637)
## Training for 30% (n = 956)
## Training for 40% (n = 1275)
## Training for 50% (n = 1594)
## Training for 60% (n = 1913)
## Training for 70% (n = 2232)
## Training for 80% (n = 2551)
## Training for 90% (n = 2870)
## Training for 100% (n = 3189)
parallel::stopCluster(cl)
registerDoSEQ()
Sampling_plot<- ggplot(learning_curve, aes(x = Training_Size, y =Accuracy, color = Data)) +
geom_smooth(method = loess, span = .8) +
theme(legend.position="top")+
labs(title = "Pet boostTree : Accuracy(Train & Test) vs m")
Sampling_plot
Our model is ready; now we prepare the test data so that we can predict sentiments for the phrases in the test dataset.
test.corpus <- VCorpus(VectorSource(test$Phrase))
test.corpus = tm_map(test.corpus, content_transformer(tolower))
test.corpus = tm_map(test.corpus, removeNumbers)
test.corpus = tm_map(test.corpus, removePunctuation)
test.corpus = tm_map(test.corpus, removeWords, stopwords())
test.corpus = tm_map(test.corpus, stripWhitespace)
df_test <- data.frame(text=unlist(sapply(test.corpus, `[`, "content")), stringsAsFactors=FALSE)
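# Same cleanup as for the training data: empty phrases become NA, the rest are lemmatized.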
val=0
for (i in 1:nrow(df_test))
{
if (df_test$text[i]== "")
{
val=c(val,gsub("",NA,df_test$text[i]))
}else{
val=c(val,lemmatize_strings(df_test$text[i]))
}
}
val <- val[2:length(val)]
df_test$text1 <- val
df_test <- df_test[!duplicated(df_test), ]
df_test <- na.omit(df_test)
df_test <- df_test[!apply(is.na(df_test) | df_test$text1 == "", 1, all),]
df_test$text <- df_test$text1
df_test <- subset(df_test, select = -text1)##tidy_ngram_test <- df_test1 %>%
## unnest_tokens(bigram, text, token = "ngrams", n=2)
train.corpus_test <- VCorpus(VectorSource(df_test$text[1:10000]))
dtm_test = DocumentTermMatrix(train.corpus_test)
tfidf_test <- weightTfIdf(dtm_test)
txt_mat_test<- as.textmatrix(as.matrix(tfidf_test))
lsa_model_test <- lsa(txt_mat_test,dim=50)
testData <- as.data.frame(as.matrix(lsa_model_test$tk)) Below is the Test dataset for which prediction is made.
prediction <-predict(rf_tree,testData)
df_test1 <- as.data.frame(df_test[1:10000,])
df_test1$Sentiment <- prediction
head(df_test1, 10)
Random Forest is the best model even though it takes a little more time than the others.
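For completeness, the predicted sentiments could be written out for submission or later inspection; a minimal sketch (the file name is an assumption):
write.csv(df_test1, "movie_review_test_predictions.csv", row.names = FALSE)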