library(tm)
## Loading required package: NLP
library(SnowballC)
library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
tr <- read.csv('train.csv', header = TRUE, stringsAsFactors = T)
tr_tweet <- read.csv('train_tweets.csv', header = TRUE, stringsAsFactors = F)
te <- read.csv('test_tweets.csv', header = TRUE, stringsAsFactors = F)
summary(tr)
## ID Sentiment
## Min. : 3 negative:17304
## 1st Qu.:12548 positive:17696
## Median :25035
## Mean :25052
## 3rd Qu.:37561
## Max. :50000
summary(tr_tweet)
## ID Tweet
## Min. : 3 Length:35000
## 1st Qu.:12548 Class :character
## Median :25035 Mode :character
## Mean :25052
## 3rd Qu.:37561
## Max. :50000
summary(te)
## ID Tweet
## Min. : 1 Length:15000
## 1st Qu.:12383 Class :character
## Median :24900 Mode :character
## Mean :24880
## 3rd Qu.:37322
## Max. :49993
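Before joining, it is worth confirming that the labels in tr line up row-for-row with the tweets in tr_tweet, since the code below indexes tr$Sentiment by row position. A minimal sanity check (my addition, assuming both files carry the same ID column):
stopifnot(all(tr$ID == tr_tweet$ID)) # rows must align for positional indexing later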
Joining the train and test tweets
r1 <- data.frame(t = tr_tweet$Tweet, stringsAsFactors = FALSE)
r2 <- data.frame(t = te$Tweet, stringsAsFactors = FALSE)
da <- rbind(r1, r2) # stack train and test tweets for joint preprocessing
Text Preprocessing
removeURL <- function(x) gsub("(f|ht)(tp)(s?)(://)(\\S*)", "", x)
da$t <- gsub("@\\w+", "", da$t)#removing the twitter handle
da$t <- gsub("#\\w+", "", da$t)#removing hashtag
da$t <- gsub("&\\w+", "", da$t)#removing html characters
da$t <- gsub("([[:alpha:]])\\1{2,}", "\\1",da$t)#removing multiple characters
da$t <- removeURL(da$t)
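To see what these regular expressions do, here is a spot-check on a made-up tweet (the string is illustrative, not from the data):
s <- "@user loooove this #blessed &amp; more at https://example.com"
s <- gsub("@\\w+", "", s)
s <- gsub("#\\w+", "", s)
s <- gsub("&\\w+", "", s)
s <- gsub("([[:alpha:]])\\1{2,}", "\\1", s)
removeURL(s) # expected: " love this  ; more at "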
Building a Corpus and Removing Punctuation, Stopwords, and Whitespace
#convert the cleaned text into a tm corpus
doc <- VCorpus(VectorSource(da$t))
doc <- tm_map(doc, content_transformer(tolower)) # convert to lower case
doc <- tm_map(doc, removeNumbers) # remove numbers
doc <- tm_map(doc, removeWords, stopwords(kind = 'en')) # remove English stopwords
doc <- tm_map(doc, removePunctuation) # remove punctuation
doc <- tm_map(doc, stripWhitespace) # collapse extra whitespace
doc <- tm_map(doc, stemDocument) # stem words to their roots
doc <- tm_map(doc, PlainTextDocument) # coerce back to plain text documents
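A quick way to eyeball the effect of the whole pipeline (a sketch; note that PlainTextDocument resets the document IDs):
lapply(doc[1:3], as.character) # inspect the first few cleaned documents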
Creating the Document Term Matrix
dtm <- DocumentTermMatrix(doc)
dim(dtm)
## [1] 50000 27719
print(dtm)
## <<DocumentTermMatrix (documents: 50000, terms: 27719)>>
## Non-/sparse entries: 317751/1385632249
## Sparsity : 100%
## Maximal term length: 117
## Weighting : term frequency (tf)
dense_dtm <- removeSparseTerms(dtm, 0.995)
dim(dense_dtm)
## [1] 50000 223
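A note on the cutoff (my reading of removeSparseTerms, worth verifying): 0.995 drops any term missing from more than 99.5% of documents, so a term must appear in roughly 50000 * 0.005 = 250 tweets to survive, which is why the vocabulary shrinks from 27719 to 223 terms. Other cutoffs can be probed like this:
sapply(c(0.99, 0.995, 0.999), function(s) ncol(removeSparseTerms(dtm, s))) # vocabulary size per cutoff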
tw_dtm <- as.data.frame(as.matrix(dense_dtm))
colnames(tw_dtm) <- make.names(colnames(tw_dtm))
train_dtm <- tw_dtm[1:35000,]
test_dtm <- tw_dtm[35001:50000,]
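A cheap guard (added) that the positional split back into train and test rows matches the original file sizes:
stopifnot(nrow(train_dtm) == nrow(tr_tweet), nrow(test_dtm) == nrow(te))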
Splitting the Train set into Train and Validation sets
#hold out 25% of the labelled rows for validation
set.seed(133)
id <- sample(35000,35000*.75)
t_dtm <- train_dtm[id,]
ty_ <- tr$Sentiment[id]
v_dtm <- train_dtm[-id,]
vy_ <- tr$Sentiment[-id]
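Since the split is random, it is worth checking (a sketch) that both partitions keep roughly the ~49/51 negative/positive balance seen in summary(tr):
prop.table(table(ty_)) # class balance in the training partition
prop.table(table(vy_)) # class balance in the validation partition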
Implementing Naive Bayes
#Naive Bayes (e1071 and caret are already loaded above)
class_nb <- naiveBayes(x = t_dtm, y = ty_, laplace = 100)
class_pred <- predict(class_nb, t_dtm) # train accuracy 0.6827; validation 0.6751 (reproduced below)
tab <- table(class_pred, ty_)
confusionMatrix(tab)
## Confusion Matrix and Statistics
##
## ty_
## class_pred negative positive
## negative 7490 2882
## positive 5448 10430
##
## Accuracy : 0.6827
## 95% CI : (0.677, 0.6883)
## No Information Rate : 0.5071
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3634
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5789
## Specificity : 0.7835
## Pos Pred Value : 0.7221
## Neg Pred Value : 0.6569
## Prevalence : 0.4929
## Detection Rate : 0.2853
## Detection Prevalence : 0.3951
## Balanced Accuracy : 0.6812
##
## 'Positive' Class : negative
##
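The confusion matrix above is computed on the training rows themselves; the validation figure of 0.6751 quoted in the comment can be reproduced like this (a sketch using the held-out partition):
val_pred <- predict(class_nb, v_dtm) # evaluate on the held-out rows
confusionMatrix(table(val_pred, vy_))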
class_pred <- predict(class_nb, test_dtm) # predictions for the test set / submission
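One caveat worth flagging: e1071::naiveBayes models numeric columns as Gaussian, so laplace only smooths categorical predictors and has little effect on these raw term counts. A common variant (an assumption on my part, not what was run here) is to binarize counts into presence/absence factors first:
# Hypothetical variant (not run above): binarize term counts so naiveBayes
# treats each term as a categorical presence/absence feature
to_presence <- function(x) factor(x > 0, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
# t_bin <- as.data.frame(lapply(t_dtm, to_presence))
# class_nb_bin <- naiveBayes(x = t_bin, y = ty_, laplace = 1)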
#SVM with radial kernel (kept for reference; commented out)
# class_svm <- svm(x = t_dtm, y = ty_)
# class_svm
# class_pred <- predict(class_svm, t_dtm) # train 0.7314, validation 0.6491
# tab <- table(class_pred, ty_)
# confusionMatrix(tab)
# class_pred <- predict(class_svm, test_dtm) # public leaderboard: 0.642333
#Random Forest (kept for reference; commented out)
# library(randomForest)
# class_rf <- randomForest(x = t_dtm, y = ty_, ntree = 500, importance = TRUE, mtry = 3)
# class_rf
# class_pred <- predict(class_rf, v_dtm) # train 0.6863, validation 0.6551 (another run: train 0.7286, validation 0.6817)
# tab <- table(class_pred, vy_)
# confusionMatrix(tab)
# class_pred <- predict(class_rf, test_dtm)
# submission <- data.frame('ID' = te$ID, 'Sentiment' = as.character(class_pred))
# filename <- paste('sentiment_analysis', format(Sys.time(), "%Y%m%d%H%M%S"), sep = '_')
# write.csv(submission, paste0(filename, '.csv'), row.names = FALSE)