Importing libraries

library(readr)
library(stringr)
library(SnowballC)
library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## Attaching package: 'RTextTools'
## The following objects are masked from 'package:SnowballC':
## 
##     getStemLanguages, wordStem
library(tm)
## Loading required package: NLP
library(SnowballC)
library(tau)
## 
## Attaching package: 'tau'
## The following object is masked from 'package:readr':
## 
##     tokenize

Importing the data

Data <- read.csv("~/NLP/final_data1_new.csv")
str(Data)
## 'data.frame':    184 obs. of  2 variables:
##  $ Message: Factor w/ 183 levels " He opened a new restaurant in a new city every year. ",..: 56 97 113 169 105 23 92 95 148 144 ...
##  $ label  : int  1 0 0 1 0 0 1 0 0 0 ...
colnames(Data)<-c("text","type")
Data$text<-as.character(Data$text)
Data$type<-as.factor(Data$type)
head(Data)
##                                                                      text
## 1                                            I love my new restaurant!!!!
## 2                                   My daughter loves her new restaurant!
## 3 Opened a new restaurant on the ocassion of Xmas and people loved it.   
## 4                             we love to invite you to our new Restaurant
## 5                         New Restaurant !!! Customer loving the ambience
## 6        Does anyone know the new Restaurant opened last week in Bristol?
##   type
## 1    1
## 2    0
## 3    0
## 4    1
## 5    0
## 6    0
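
Before modelling, it is worth checking the class balance, since plain accuracy can be misleading on skewed labels. A quick sketch in base R (output omitted):

# distribution of the target labels
table(Data$type)
prop.table(table(Data$type))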

Creating the corpus and cleaning it. A few stopwords carry signal here (the pronouns "i", "my", "our"), so we exclude them from the stopword list before removal.

# Creating the corpus from the text column
corpusHS <- Corpus(VectorSource(Data$text))
# strip digits
corpusHS <- tm_map(corpusHS, removeNumbers)
# replace punctuation with spaces so words are not glued together
corpusHS <- tm_map(corpusHS, str_replace_all, pattern = "[[:punct:]]", replacement = " ")
# drop English stopwords, but keep the pronouns that carry signal
exceptions   <- c("i", "my", "our")
my_stopwords <- setdiff(stopwords("en"), exceptions)
corpusHS <- tm_map(corpusHS, removeWords, words = my_stopwords)
corpusHS <- tm_map(corpusHS, tolower)
# corpusHS <- tm_map(corpusHS, stemDocument)   # stemming skipped here
# re-wrap the plain character vectors as PlainTextDocument objects
corpusHS <- tm_map(corpusHS, PlainTextDocument)

corpusHS[[25]]$content
## [1] "people  loving my new  star restaurant     luxury   star   expense   star  "
corpusHS[[1]]$content
## [1] "i love my new restaurant    "
Data[25,1]
## [1] "people are loving my new 3 star restaurant too as it is luxury of 5 star at the expense of 3 star. "
Data[1,1]
## [1] "I love my new restaurant!!!!"

N-grams (1, 2, 3)

Building the document-term matrix (DTM)

# despite the usual "bigram" name for this pattern, the tokenizer emits
# unigrams, bigrams and trigrams (n = 1, 2, 3)
NgramTokenizer <-
  function(x)
    unlist(lapply(ngrams(words(x), c(1, 2, 3)), paste, collapse = " "), use.names = FALSE)

dtm <- DocumentTermMatrix(corpusHS, control = list(tokenize = NgramTokenizer))
#dtm <-  DocumentTermMatrix(corpusHS, control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE)))
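
A quick sanity check on the resulting matrix; dim() and inspect() are standard base/tm calls (output omitted):

# documents x terms
dim(dtm)
# a small corner of the matrix
inspect(dtm[1:3, 1:5])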

Training and Testing

#Training and Testing
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
set.seed(100)
inTrain1<-createDataPartition(Data$type,p=0.60,list=F)
dtm_train<-dtm[inTrain1,]
dtm_test<-dtm[-inTrain1,]
nrow(dtm_train)
## [1] 111
nrow(dtm_test)
## [1] 73
Data_train_labels <- Data[inTrain1, ]$type
Data_test_labels  <- Data[-inTrain1, ]$type
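
Because createDataPartition() samples within each class, the label proportions should be roughly preserved in both splits. A quick check (output omitted):

# class proportions in the two splits
prop.table(table(Data_train_labels))
prop.table(table(Data_test_labels))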

Removing Sparse Terms

# http://stats.stackexchange.com/questions/160539/is-this-interpretation-of-sparsity-accurate
# `sparse` is the maximum allowed sparsity per term:
# a value near 1 (e.g. 0.999) keeps almost every term, however rare;
# a small value (e.g. 0.01) would keep only terms present in nearly every document
Data_dtm_freq_train <- removeSparseTerms(dtm_train, 0.999)
Data_dtm_freq_train
## <<DocumentTermMatrix (documents: 111, terms: 1876)>>
## Non-/sparse entries: 2458/205778
## Sparsity           : 99%
## Maximal term length: 34
## Weighting          : term frequency (tf)
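
The threshold trades vocabulary size against robustness; repeating the call with stricter values shows the effect. A sketch (output omitted):

# stricter thresholds keep only terms that appear in more documents
removeSparseTerms(dtm_train, 0.99)
removeSparseTerms(dtm_train, 0.95)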

Finding frequent terms within a given frequency range

# findFreqTerms(x, lowfreq = 0, highfreq = Inf)
# keep terms that occur between 2 and 30 times in the training DTM
Data_freq_words <- findFreqTerms(dtm_train, 2, 30)
# findAssocs(x, terms, corlimit) lists terms correlated with `terms` above corlimit
findAssocs(dtm,"my new",0.3)
## $`my new`
## my new restaurant       love my new    new restaurant               new 
##              0.94              0.53              0.43              0.37 
##             i can           without 
##              0.30              0.30
findAssocs(dtm,"new restaurant",0.25)
## $`new restaurant`
##                new  my new restaurant             my new 
##               0.81               0.47               0.43 
## our new restaurant 
##               0.26

Subsetting the train and test DTMs to the frequent terms

Converting counts to "No"/"Yes" (0 becomes "No", anything greater becomes "Yes")

# create DTMs with only the frequent terms
Data_dtm_freq_train <- dtm_train[ , Data_freq_words]
Data_dtm_freq_test <- dtm_test[ , Data_freq_words]
class(Data_dtm_freq_train)
## [1] "DocumentTermMatrix"    "simple_triplet_matrix"
#inspect(Data_dtm_freq_train)
#inspect(Data_dtm_freq_test)
# convert counts to a "No"/"Yes" encoding
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
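
A quick sanity check of the helper on a toy vector:

# counts 0, 1, 3 map to "No", "Yes", "Yes"
convert_counts(c(0, 1, 3))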
# apply convert_counts() to each column of the train/test matrices
Data_train <- apply(Data_dtm_freq_train, MARGIN = 2, convert_counts)
Data_test  <- apply(Data_dtm_freq_test, MARGIN = 2, convert_counts)

Training the model

Predicting the accuracy

library(e1071)

# train a Naive Bayes classifier with Laplace smoothing and score the test set
Data_classifier <- naiveBayes(Data_train, Data_train_labels, laplace = 1)
Data_test_pred <- predict(Data_classifier, Data_test)
confusionMatrix(Data_test_pred, Data_test_labels, positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 54  7
##          1  2 10
##                                          
##                Accuracy : 0.8767         
##                  95% CI : (0.7788, 0.942)
##     No Information Rate : 0.7671         
##     P-Value [Acc > NIR] : 0.01443        
##                                          
##                   Kappa : 0.6156         
##  Mcnemar's Test P-Value : 0.18242        
##                                          
##             Sensitivity : 0.5882         
##             Specificity : 0.9643         
##          Pos Pred Value : 0.8333         
##          Neg Pred Value : 0.8852         
##              Prevalence : 0.2329         
##          Detection Rate : 0.1370         
##    Detection Prevalence : 0.1644         
##       Balanced Accuracy : 0.7763         
##                                          
##        'Positive' Class : 1              
## 
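
The individual statistics can also be pulled out programmatically from the confusionMatrix object, which is handy for reporting; a small sketch:

cm <- confusionMatrix(Data_test_pred, Data_test_labels, positive = '1')
cm$overall["Accuracy"]
cm$byClass[c("Sensitivity", "Specificity")]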

Testing

by passing a row index of the original data to a helper function

##########Testing#############

test <- function(col1){
  dtm_test <- dtm[col1, ]
  Data_dtm_freq_test <- dtm_test[ , Data_freq_words]
  # reuse the convert_counts() helper defined above
  Data_test <- apply(Data_dtm_freq_test, MARGIN = 2, convert_counts)
  # caveat: for a single index, apply() collapses the one-row matrix to a
  # plain named vector, so predict() scores each term as its own observation;
  # that is why test(1) below prints one prediction per term
  a <- predict(Data_classifier, Data_test)
  return(a)
}
test(1)
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [36] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [71] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [106] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [141] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [176] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1

Testing all the rows labelled 1 in the main data by passing them one at a time

ones<-which(Data$type==1)

# prepend one column of predictions per positive example
result1 <- NA
for (i in seq_along(ones)) {
  result  <- data.frame(test(ones[i]))
  result1 <- cbind(result, result1)
}
# length() of a data frame is its column count: one per positive example,
# plus the initial NA column
length(result1)
## [1] 44
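
Growing result1 with cbind() inside a loop works at this scale but scales poorly and leaves a stray NA column; an equivalent sketch with lapply() avoids both:

# one list element per positive example, no growing object
result_list <- lapply(ones, test)
length(result_list)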