Importing libraries
library(readr)
library(stringr)
library(SnowballC)
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Attaching package: 'RTextTools'
## The following objects are masked from 'package:SnowballC':
##
## getStemLanguages, wordStem
library(tm)
## Loading required package: NLP
library(SnowballC)
library(tau)
##
## Attaching package: 'tau'
## The following object is masked from 'package:readr':
##
## tokenize
Importing the data
Data <- read.csv("~/NLP/final_data1_new.csv")
str(Data)
## 'data.frame': 184 obs. of 2 variables:
## $ Message: Factor w/ 183 levels " He opened a new restaurant in a new city every year. ",..: 56 97 113 169 105 23 92 95 148 144 ...
## $ label : int 1 0 0 1 0 0 1 0 0 0 ...
colnames(Data)<-c("text","type")
Data$text<-as.character(Data$text)
Data$type<-as.factor(Data$type)
head(Data)
## text
## 1 I love my new restaurant!!!!
## 2 My daughter loves her new restaurant!
## 3 Opened a new restaurant on the ocassion of Xmas and people loved it.
## 4 we love to invite you to our new Restaurant
## 5 New Restaurant !!! Customer loving the ambience
## 6 Does anyone know the new Restaurant opened last week in Bristol?
## type
## 1 1
## 2 0
## 3 0
## 4 1
## 5 0
## 6 0
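It is worth checking the class balance up front: the positive class is the minority, roughly a quarter of the rows (matching the prevalence of 0.2329 reported in the confusion matrix later).
# quick look at the label distribution
table(Data$type)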
Creating the corpus and cleaning it. Some stopwords ("i", "my", "our") carry signal we need, so we exclude them from the stopword list before removal.
#Creating Corpus
# use VCorpus() rather than Corpus(): recent versions of tm return a
# SimpleCorpus from Corpus(), which ignores the custom n-gram tokenizer
# passed to DocumentTermMatrix later on
corpusHS <- VCorpus(VectorSource(Data$text))
corpusHS = tm_map(corpusHS, removeNumbers)
# str_replace_all() and tolower() are not tm transformations, so wrap them
# in content_transformer() to keep the corpus structure intact
corpusHS = tm_map(corpusHS, content_transformer(str_replace_all), pattern="[[:punct:]]", replacement=" ")
# lowercase before removing stopwords, since removeWords is case-sensitive
corpusHS = tm_map(corpusHS, content_transformer(tolower))
# keep "i", "my" and "our": these first-person words carry the signal we classify on
exceptions <- c("i","my","our")
my_stopwords <- setdiff(stopwords("en"), exceptions)
corpusHS = tm_map(corpusHS, removeWords, words=my_stopwords)
#corpusHS = tm_map(corpusHS, stemDocument)
corpusHS[[25]]$content
## [1] "people loving my new star restaurant luxury star expense star "
corpusHS[[1]]$content
## [1] "i love my new restaurant "
Data[25,1]
## [1] "people are loving my new 3 star restaurant too as it is luxury of 5 star at the expense of 3 star. "
Data[1,1]
## [1] "I love my new restaurant!!!!"
Creating the document-term matrix with n-grams (1, 2, 3)
# despite its name, this tokenizer emits unigrams, bigrams and trigrams
BigramTokenizer <-
  function(x)
    unlist(lapply(ngrams(words(x), c(1,2,3)), paste, collapse = " "), use.names = FALSE)
dtm <- DocumentTermMatrix(corpusHS, control = list(tokenize = BigramTokenizer))
#dtm <- DocumentTermMatrix(corpusHS, control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE)))
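To see what the tokenizer emits, ngrams() from NLP can be applied to a hand-split toy sentence (an illustrative sketch; the toy tokens are not taken from the data above).
# illustrative: 1-, 2- and 3-grams from a pre-split toy sentence
toks <- c("i", "love", "my", "new", "restaurant")
unlist(lapply(ngrams(toks, c(1,2,3)), paste, collapse = " "), use.names = FALSE)
# expected output includes "i", "love", ..., "i love", "love my", ..., "i love my", ...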
Training and Testing
#Training and Testing
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
set.seed(100)
inTrain1<-createDataPartition(Data$type,p=0.60,list=F)
dtm_train<-dtm[inTrain1,]
dtm_test<-dtm[-inTrain1,]
nrow(dtm_train)
## [1] 111
nrow(dtm_test)
## [1] 73
Data_train_labels <- Data[inTrain1, ]$type
Data_test_labels <- Data[-inTrain1, ]$type
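Because createDataPartition() stratifies on the label, both sets should carry a similar 0/1 mix, which can be verified with prop.table().
# the stratified split should give similar label proportions in both sets
prop.table(table(Data_train_labels))
prop.table(table(Data_test_labels))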
Removing Sparse Terms
#http://stats.stackexchange.com/questions/160539/is-this-interpretation-of-sparsity-accurate
# a sparse value close to 1 (e.g. 0.9999) keeps nearly every term, however rare;
# a small value (e.g. 0.01) keeps only terms that appear in almost every document
Data_dtm_freq_train <- removeSparseTerms(dtm_train, 0.999)
Data_dtm_freq_train
## <<DocumentTermMatrix (documents: 111, terms: 1876)>>
## Non-/sparse entries: 2458/205778
## Sparsity : 99%
## Maximal term length: 34
## Weighting : term frequency (tf)
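To make the threshold concrete: with the 111 training documents above, a much tighter value such as 0.99 would keep only terms appearing in at least (1 - 0.99) * 111 = 1.11, i.e. 2 or more, documents (illustrative only; 0.99 is not the value used in this analysis).
# illustrative: sparse = 0.99 keeps terms present in at least 2 of 111 docs
dim(removeSparseTerms(dtm_train, 0.99))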
Finding the terms whose frequency falls within a given range
#findFreqTerms(dtm_train,2,30)
#findFreqTerms(x, lowfreq = 0, highfreq = Inf)
Data_freq_words <- findFreqTerms(dtm_train,2,30)
#?findAssocs()
findAssocs(dtm,"my new",0.3)
## $`my new`
## my new restaurant love my new new restaurant new
## 0.94 0.53 0.43 0.37
## i can without
## 0.30 0.30
findAssocs(dtm,"new restaurant",0.25)
## $`new restaurant`
## new my new restaurant my new
## 0.81 0.47 0.43
## our new restaurant
## 0.26
#findAssocs(x, terms, corlimit)
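Data_freq_words itself is just a character vector of the terms that passed the (2, 30) frequency bounds, about 200 terms here.
length(Data_freq_words)
## [1] 200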
Subsetting the DTMs to the frequent words, then converting each count: 0 becomes "No", anything above 0 becomes "Yes".
# create DTMs with only the frequent terms
Data_dtm_freq_train <- dtm_train[ , Data_freq_words]
Data_dtm_freq_test <- dtm_test[ , Data_freq_words]
class(Data_dtm_freq_train)
## [1] "DocumentTermMatrix" "simple_triplet_matrix"
#inspect(Data_dtm_freq_train)
#inspect(Data_dtm_freq_test)
# convert counts to a Yes/No indicator
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}
# apply() convert_counts() to columns of train/test data
Data_train <- apply(Data_dtm_freq_train, MARGIN = 2, convert_counts)
Data_test <- apply(Data_dtm_freq_test, MARGIN = 2, convert_counts)
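convert_counts() simply maps zero to "No" and any positive count to "Yes", e.g.:
convert_counts(c(0, 1, 3))
## [1] "No"  "Yes" "Yes"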
Train the model
Predict the accuracy
library(e1071)
# laplace = 1 adds one-count smoothing so unseen terms cannot zero out a class
Data_classifier <- naiveBayes(Data_train, Data_train_labels, laplace = 1)
Data_test_pred <- predict(Data_classifier, Data_test)
confusionMatrix(Data_test_pred, Data_test_labels, positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 54 7
## 1 2 10
##
## Accuracy : 0.8767
## 95% CI : (0.7788, 0.942)
## No Information Rate : 0.7671
## P-Value [Acc > NIR] : 0.01443
##
## Kappa : 0.6156
## Mcnemar's Test P-Value : 0.18242
##
## Sensitivity : 0.5882
## Specificity : 0.9643
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.8852
## Prevalence : 0.2329
## Detection Rate : 0.1370
## Detection Prevalence : 0.1644
## Balanced Accuracy : 0.7763
##
## 'Positive' Class : 1
##
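The fitted naiveBayes object stores one class-conditional table per feature; inspecting a single feature shows its Laplace-smoothed No/Yes proportions for each class (a quick look, assuming "my new" survived the frequency filter).
# class-conditional No/Yes probabilities for one feature
Data_classifier$tables[["my new"]]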
Testing by passing the index of a single document
##########Testing#############
test<-function(idx){
  # keep only the frequent terms, exactly as for the training data
  Data_dtm_freq_test <- dtm[idx, Data_freq_words]
  # convert counts to Yes/No using convert_counts() defined above
  Data_test <- apply(Data_dtm_freq_test, MARGIN = 2, convert_counts)
  # apply() collapses a single document to a bare vector, which predict()
  # would misread as one observation per term; restore the one-row shape
  if (is.null(dim(Data_test))) {
    Data_test <- t(as.matrix(Data_test))
  }
  return(predict(Data_classifier, Data_test))
}
test(1)
The call returns a single factor prediction (levels 0 and 1) for the chosen document.
Testing all the rows labelled 1 in the data by passing them one at a time
ones<-which(Data$type==1)
result1 <- character(0)
for(i in seq_along(ones)){
  # collect one prediction per positive row
  result1 <- c(result1, as.character(test(ones[i])))
}
length(result1)
## [1] 43
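Tabulating the collected predictions gives a quick recall check over all 43 positive rows.
# how many of the positive examples come back as "1"
table(result1)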