library(tm)
## Loading required package: NLP
library(SnowballC)
library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
tr <- read.csv('train.csv', header = TRUE, stringsAsFactors = T)
tr_tweet <- read.csv('train_tweets.csv', header = TRUE, stringsAsFactors = F)
te <- read.csv('test_tweets.csv', header = TRUE, stringsAsFactors = F)


summary(tr)
##        ID           Sentiment    
##  Min.   :    3   negative:17304  
##  1st Qu.:12548   positive:17696  
##  Median :25035                   
##  Mean   :25052                   
##  3rd Qu.:37561                   
##  Max.   :50000
summary(tr_tweet)
##        ID           Tweet          
##  Min.   :    3   Length:35000      
##  1st Qu.:12548   Class :character  
##  Median :25035   Mode  :character  
##  Mean   :25052                     
##  3rd Qu.:37561                     
##  Max.   :50000
summary(te)
##        ID           Tweet          
##  Min.   :    1   Length:15000      
##  1st Qu.:12383   Class :character  
##  Median :24900   Mode  :character  
##  Mean   :24880                     
##  3rd Qu.:37322                     
##  Max.   :49993

Joining the train and test tweets

r1 <- data.frame('t' = tr_tweet$Tweet)
r2 <- data.frame('t' = te$Tweet)
da <- rbind(r1,r2)
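
As a quick sanity check, the stacked frame should contain one row for every train and test tweet:

#sanity check on the combined data
stopifnot(nrow(da) == nrow(tr_tweet) + nrow(te)) #35000 + 15000 = 50000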

Text Preprocessing

removeURL <- function(x)  gsub("(f|ht)(tp)(s?)(://)(\\S*)", "", x)
da$t <- gsub("@\\w+", "", da$t)#removing the twitter handle
da$t <- gsub("#\\w+", "", da$t)#removing hashtag
da$t <- gsub("&\\w+", "", da$t)#removing html characters
da$t <- gsub("([[:alpha:]])\\1{2,}", "\\1",da$t)#removing multiple characters
da$t <- removeURL(da$t)
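
To see what these patterns do together, here they are applied to a single invented tweet (illustrative only, not drawn from the data):

#illustrative example on a made-up tweet
ex <- "@user I loooove this!!! #blessed &amp; see https://t.co/abc"
ex <- gsub("@\\w+", "", ex)
ex <- gsub("#\\w+", "", ex)
ex <- gsub("&\\w+", "", ex)
ex <- gsub("([[:alpha:]])\\1{2,}", "\\1", ex)
removeURL(ex)
## [1] " I love this!!!  ; see "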

Building a Corpus and removing punctuation, stopwords, and white space

#convert the cleaned text into a tm corpus
doc <- VCorpus(VectorSource(da$t))

doc <- tm_map(doc, tolower) #convert to lower case (base tolower returns plain character vectors, hence the PlainTextDocument step below)
doc <- tm_map(doc, removeNumbers) #remove numbers
doc <- tm_map(doc, removeWords, stopwords(kind = 'en')) #remove English stopwords
doc <- tm_map(doc, removePunctuation) #remove punctuation
doc <- tm_map(doc, stripWhitespace) #collapse repeated white space
doc <- tm_map(doc, stemDocument) #stem each word to its root form

doc <- tm_map(doc, PlainTextDocument) #convert back to PlainTextDocument so a DocumentTermMatrix can be built
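
To verify the cleaning, it helps to compare a tweet before and after; the corpus preserves the row order of da, so the same index lines up:

#before/after spot check on the first tweet
da$t[1]
as.character(doc[[1]])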

Creating the Document Term Matrix

dtm <- DocumentTermMatrix(doc)
dim(dtm)
## [1] 50000 27719
print(dtm)
## <<DocumentTermMatrix (documents: 50000, terms: 27719)>>
## Non-/sparse entries: 317751/1385632249
## Sparsity           : 100%
## Maximal term length: 117
## Weighting          : term frequency (tf)
dense_dtm <- removeSparseTerms(dtm, 0.995)
dim(dense_dtm)
## [1] 50000   223
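
The 0.995 sparsity cutoff keeps only terms that appear in at least 0.5% of the 50,000 documents (roughly 250 tweets), which is what shrinks the vocabulary from 27,719 terms to 223. findFreqTerms() offers a quick look at what survives (the frequency threshold of 1000 here is an arbitrary choice):

#terms occurring at least 1000 times across the corpus
findFreqTerms(dense_dtm, lowfreq = 1000)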
tw_dtm <- as.data.frame(as.matrix(dense_dtm))
colnames(tw_dtm) <- make.names(colnames(tw_dtm))

train_dtm <- tw_dtm[1:35000,]
test_dtm <- tw_dtm[35001:50000,]

Splitting the Train Set into Train and Validation Sets

#split the 35000 training rows 75/25 into training and validation partitions
set.seed(133)
id <- sample(35000,35000*.75)
t_dtm <- train_dtm[id,]
ty_ <- tr$Sentiment[id]
v_dtm <- train_dtm[-id,]
vy_ <- tr$Sentiment[-id]
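
Because sample() draws rows uniformly at random, both partitions should roughly preserve the 49/51 negative/positive balance of the full training set; a quick check:

#class proportions in the training and validation partitions
prop.table(table(ty_))
prop.table(table(vy_))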

Implementing Naive Bayes

#Naive Bayes
class_nb <- naiveBayes(x = t_dtm, y = ty_, laplace = 100) #train with heavy Laplace smoothing
class_pred <- predict(class_nb, t_dtm) #predict on the training partition (train accuracy 0.6827, validation 0.6751)
tab <- table(class_pred, ty_)
confusionMatrix(tab)
## Confusion Matrix and Statistics
## 
##           ty_
## class_pred negative positive
##   negative     7490     2882
##   positive     5448    10430
##                                          
##                Accuracy : 0.6827         
##                  95% CI : (0.677, 0.6883)
##     No Information Rate : 0.5071         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.3634         
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.5789         
##             Specificity : 0.7835         
##          Pos Pred Value : 0.7221         
##          Neg Pred Value : 0.6569         
##              Prevalence : 0.4929         
##          Detection Rate : 0.2853         
##    Detection Prevalence : 0.3951         
##       Balanced Accuracy : 0.6812         
##                                          
##        'Positive' Class : negative       
## 
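
The comment above quotes a validation accuracy of 0.6751, but the held-out evaluation itself is not shown; it follows the same pattern on the v_dtm/vy_ split (a sketch):

#evaluate on the held-out validation set (reported above as 0.6751)
val_pred <- predict(class_nb, v_dtm)
confusionMatrix(table(val_pred, vy_))
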
class_pred <- predict(class_nb, test_dtm) #predict on the 15000 test tweets
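
These test predictions can be written out with the same submission pattern used in the commented randomForest code below (a sketch; the file name is an arbitrary choice):

#write the Naive Bayes test predictions in the ID/Sentiment submission format
submission <- data.frame('ID' = te$ID, 'Sentiment' = as.character(class_pred))
write.csv(submission, 'sentiment_analysis_nb.csv', row.names = FALSE)
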
#SVM Radial
# class_svm <- svm(x=t_dtm, y=ty_)
# class_svm
# class_pred <- predict(class_svm, t_dtm) #train accuracy 0.7314, validation 0.6491
# tab <- table(class_pred,ty_)
# confusionMatrix(tab)
# class_pred <- predict(class_svm, test_dtm) #public leaderboard score 0.642333

#randomForest
# library(randomForest)
# class_rf <- randomForest(x=t_dtm,y=ty_,ntree = 500,importance = T,mtry = 3)
# class_rf
# class_pred <- predict(class_rf, v_dtm)#validation-0.6551 train-0.6863 ##train-0.7286 validation-0.6817
# tab <- table(class_pred, vy_)
# confusionMatrix(tab) 
# class_pred <- predict(class_rf,test_dtm)
# submission <- data.frame('ID'=te$ID,'Sentiment'=as.character(class_pred))
# filename <- paste('sentiment_analysis', format(Sys.time(), "%Y%m%d%H%M%S"), sep = '_')
# write.csv(submission,paste0(filename,'.csv',collapse = ''),row.names = FALSE)