There are mainly 4 methods, 1-Normal GLM. 2-Removing StopWords and Prune-Vocabulary with N-gram(1,2). 3-Feature Hashing(creates a text vectorizer function by using hash buckets) with N-gram(1,2). 4-Basic transform(Normalization with l1-Manhatten) with TFIDF

Loading Dataset

#https://cran.r-project.org/web/packages/text2vec/vignettes/text-vectorization.html
library(text2vec)
Data <- read.csv("~/NLP/final_data1_new.csv")
str(Data)

## 'data.frame':    184 obs. of  2 variables:
##  $ Message: Factor w/ 183 levels " He opened a new restaurant in a new city every year. ",..: 56 97 113 169 105 23 92 95 148 144 ...
##  $ label  : int  1 0 0 1 0 0 1 0 0 0 ...

colnames(Data)<-c("text","type")
Data$text<-as.character(Data$text)
head(Data)

##                                                                      text
## 1                                            I love my new restaurant!!!!
## 2                                   My daughter loves her new restaurant!
## 3 Opened a new restaurant on the ocassion of Xmas and people loved it.   
## 4                             we love to invite you to our new Restaurant
## 5                         New Restaurant !!! Customer loving the ambience
## 6        Does anyone know the new Restaurant opened last week in Bristol?
##   type
## 1    1
## 2    0
## 3    0
## 4    1
## 5    0
## 6    0

dim(Data)

## [1] 184   2

summary(Data)

##      text                type       
##  Length:184         Min.   :0.0000  
##  Class :character   1st Qu.:0.0000  
##  Mode  :character   Median :0.0000  
##                     Mean   :0.2337  
##                     3rd Qu.:0.0000  
##                     Max.   :1.0000

Training and Testing

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

set.seed(100)
inTrain1<-createDataPartition(Data$type,p=0.60,list=F)
train<-Data[inTrain1,]
test<-Data[-inTrain1,]

train<-cbind(train,id=rownames(train))
test<-cbind(test,id=rownames(test))
rownames(train)<-c(1:nrow(train))
rownames(test)<-c(1:nrow(test))
nrow(train)

## [1] 111

nrow(test)

## [1] 73

On Training part

1-Word Tokenizer

prep_fun = tolower
tok_fun = word_tokenizer

it_train = itoken(train$text, 
                  preprocessor = prep_fun, 
                  tokenizer = tok_fun, 
                  ids = train$id, 
                  progressbar = FALSE)
class(it_train)

## [1] "itoken"       "iterator"     "iter"         "abstractiter"
## [5] "R6"

2-Create vocabulary

3-Vocabulary Vectorization

4-Create DTM

vocab = create_vocabulary(it_train)
vectorizer = vocab_vectorizer(vocab)
t1 = Sys.time()
dtm_train = create_dtm(it_train, vectorizer)
print(difftime(Sys.time(), t1, units = 'sec'))

## Time difference of 0.1736 secs

dim(dtm_train)

## [1] 111 566

Model1-Normal Nfold GLm

library(glmnet)

## Loading required package: Matrix

## Loading required package: foreach

## Loaded glmnet 2.0-5

NFOLDS = 4
t1 = Sys.time()
glmnet_classifier = cv.glmnet(x = dtm_train, y = train[['type']], 
                              family = 'binomial', 
                              # L1 penalty
                              alpha = 1,
                              # interested in the area under ROC curve
                              type.measure = "auc",
                              # 4-fold cross-validation
                              nfolds = NFOLDS,
                              # high value is less accurate, but has faster training
                              thresh = 1e-3,
                              # again lower number of iterations for faster training
                              maxit = 1e3)
print(difftime(Sys.time(), t1, units = 'sec'))

## Time difference of 0.1736 secs

plot(glmnet_classifier)

print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))

## [1] "max AUC = 0.8482"

Repeating the same steps on Testing part

it_test = test$text %>% 
  prep_fun %>% 
  tok_fun %>% 
  itoken(ids = test$id, 
         # turn off progressbar because it won't look nice in rmd
         progressbar = FALSE)

dtm_test = create_dtm(it_test, vectorizer)

Predicting

preds = predict(glmnet_classifier, dtm_test, type = 'response')[,1]
glmnet:::auc(test$type, preds)

## [1] 0.903753

test_result<-data.frame(cbind(test$type,ifelse(preds>0.5,1,0)))
confusionMatrix(test_result$X2,test_result$X1)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 59 12
##          1  0  2
##                                           
##                Accuracy : 0.8356          
##                  95% CI : (0.7305, 0.9121)
##     No Information Rate : 0.8082          
##     P-Value [Acc > NIR] : 0.337075        
##                                           
##                   Kappa : 0.2122          
##  Mcnemar's Test P-Value : 0.001496        
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.1429          
##          Pos Pred Value : 0.8310          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.8082          
##          Detection Rate : 0.8082          
##    Detection Prevalence : 0.9726          
##       Balanced Accuracy : 0.5714          
##                                           
##        'Positive' Class : 0               
##

Remove Stopwords

stop_words = c( "me", "myself", "ours", "ourselves", "you", "your", "yours","the")
t1 = Sys.time()
vocab = create_vocabulary(it_train, stopwords = stop_words)
print(difftime(Sys.time(), t1, units = 'sec'))

## Time difference of 0.0170002 secs

Prune Vocabulary

-Throws out very frequent and very infrequent terms

Vectorize

pruned_vocab = prune_vocabulary(vocab, 
                                term_count_min = 10, 
                                doc_proportion_max = 0.5,
                                doc_proportion_min = 0.001)
vectorizer = vocab_vectorizer(pruned_vocab)

Creating DTM for Training and Testing

# create dtm_train with new pruned vocabulary vectorizer
t1 = Sys.time()
dtm_train  = create_dtm(it_train, vectorizer)
print(difftime(Sys.time(), t1, units = 'sec'))

## Time difference of 0.1570001 secs

dim(dtm_train)

## [1] 111  24

dtm_test   = create_dtm(it_test, vectorizer)
dim(dtm_test)

## [1] 73 24

Method2-N-gram

t1 = Sys.time()
vocab = create_vocabulary(it_train, ngram = c(1L, 2L))
print(difftime(Sys.time(), t1, units = 'sec'))

## Time difference of 0.01559997 secs

vocab = vocab %>% prune_vocabulary(term_count_min = 10, 
                                   doc_proportion_max = 0.5)

bigram_vectorizer = vocab_vectorizer(vocab)

dtm_train = create_dtm(it_train, bigram_vectorizer)

Model

t1 = Sys.time()
glmnet_classifier = cv.glmnet(x = dtm_train, y = train[['type']], 
                              family = 'binomial', 
                              alpha = 1,
                              type.measure = "auc",
                              nfolds = NFOLDS,
                              thresh = 1e-3,
                              maxit = 1e3)
print(difftime(Sys.time(), t1, units = 'sec'))

## Time difference of 0.07799983 secs

Testing

plot(glmnet_classifier)

print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))

## [1] "max AUC = 0.9443"

# apply vectorizer
dtm_test = create_dtm(it_test, bigram_vectorizer)
preds = predict(glmnet_classifier, dtm_test, type = 'response')[,1]
test_result1<-data.frame(cbind(test$type,ifelse(preds>0.5,1,0)))
glmnet:::auc(test$type, preds)

## [1] 0.8753027

confusionMatrix(test_result1$X2,test_result1$X1)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 58  6
##          1  1  8
##                                           
##                Accuracy : 0.9041          
##                  95% CI : (0.8124, 0.9606)
##     No Information Rate : 0.8082          
##     P-Value [Acc > NIR] : 0.02046         
##                                           
##                   Kappa : 0.6419          
##  Mcnemar's Test P-Value : 0.13057         
##                                           
##             Sensitivity : 0.9831          
##             Specificity : 0.5714          
##          Pos Pred Value : 0.9062          
##          Neg Pred Value : 0.8889          
##              Prevalence : 0.8082          
##          Detection Rate : 0.7945          
##    Detection Prevalence : 0.8767          
##       Balanced Accuracy : 0.7772          
##                                           
##        'Positive' Class : 0               
##

Method3-Feature hashing with N-gram

hash_vectorizer-creats a text vectorizer function which used in constructing dtm/cropus

hash_size means hash_buckets

h_vectorizer = hash_vectorizer(hash_size = 2 ^ 14, ngram = c(1L, 2L))

t1 = Sys.time()
dtm_train = create_dtm(it_train, h_vectorizer)
print(difftime(Sys.time(), t1, units = 'sec'))

## Time difference of 0.1570001 secs

Model

t1 = Sys.time()
glmnet_classifier = cv.glmnet(x = dtm_train, y = train[['type']], 
                              family = 'binomial', 
                              alpha = 1,
                              type.measure = "auc",
                              nfolds = 5,
                              thresh = 1e-3,
                              maxit = 1e3)
print(difftime(Sys.time(), t1, units = 'sec'))

## Time difference of 1.9766 secs

Testing

plot(glmnet_classifier)

print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))

## [1] "max AUC = 0.8875"

dtm_test = create_dtm(it_test, h_vectorizer)

preds = predict(glmnet_classifier, dtm_test , type = 'response')[, 1]
test_result2<-data.frame(cbind(test$type,ifelse(preds>0.5,1,0)))
glmnet:::auc(test$type, preds)

## [1] 0.8807506

confusionMatrix(test_result2$X2,test_result2$X1)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 58  6
##          1  1  8
##                                           
##                Accuracy : 0.9041          
##                  95% CI : (0.8124, 0.9606)
##     No Information Rate : 0.8082          
##     P-Value [Acc > NIR] : 0.02046         
##                                           
##                   Kappa : 0.6419          
##  Mcnemar's Test P-Value : 0.13057         
##                                           
##             Sensitivity : 0.9831          
##             Specificity : 0.5714          
##          Pos Pred Value : 0.9062          
##          Neg Pred Value : 0.8889          
##              Prevalence : 0.8082          
##          Detection Rate : 0.7945          
##    Detection Prevalence : 0.8767          
##       Balanced Accuracy : 0.7772          
##                                           
##        'Positive' Class : 0               
##

Basic transformations with TFIDF

Normalization

dtm_train_l1_norm = normalize(dtm_train, "l1")
#l1 is the Manhattan Normalization
#difference btwn 2 vectors or matrix
#TF-IDF
vocab = create_vocabulary(it_train)
vectorizer = vocab_vectorizer(vocab)
dtm_train = create_dtm(it_train, vectorizer)

TFIDF

# define tfidf model
tfidf = TfIdf$new()
# fit model to train data and transform train data with fitted model
dtm_train_tfidf = fit_transform(dtm_train, tfidf)
# tfidf modified by fit_transform() call!
# apply pre-trained tf-idf transformation to test data
dtm_test_tfidf  = create_dtm(it_test, vectorizer) %>% 
  transform(tfidf)

Model

t1 = Sys.time()
glmnet_classifier = cv.glmnet(x = dtm_train_tfidf, y = train[['type']], 
                              family = 'binomial', 
                              alpha = 1,
                              type.measure = "auc",
                              nfolds = NFOLDS,
                              thresh = 1e-3,
                              maxit = 1e3)
print(difftime(Sys.time(), t1, units = 'sec'))

## Time difference of 0.1624 secs

Testing

plot(glmnet_classifier)

print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))

## [1] "max AUC = 0.9104"

preds = predict(glmnet_classifier, dtm_test_tfidf, type = 'response')[,1]
test_result3<-data.frame(cbind(test$type,ifelse(preds>0.5,1,0)))
glmnet:::auc(test$type, preds)

## [1] 0.8753027

confusionMatrix(test_result3$X2,test_result3$X1)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 57 11
##          1  2  3
##                                           
##                Accuracy : 0.8219          
##                  95% CI : (0.7147, 0.9016)
##     No Information Rate : 0.8082          
##     P-Value [Acc > NIR] : 0.4530          
##                                           
##                   Kappa : 0.239           
##  Mcnemar's Test P-Value : 0.0265          
##                                           
##             Sensitivity : 0.9661          
##             Specificity : 0.2143          
##          Pos Pred Value : 0.8382          
##          Neg Pred Value : 0.6000          
##              Prevalence : 0.8082          
##          Detection Rate : 0.7808          
##    Detection Prevalence : 0.9315          
##       Balanced Accuracy : 0.5902          
##                                           
##        'Positive' Class : 0               
##

Summary

Among all 4 methods, second methods give accuary better than others.

Text Mining and Prediction with text2vec

Sujith Kumar

March 2, 2017

Loading Dataset

Training and Testing

On Training part

1-Word Tokenizer

2-Create vocabulary

3-Vocabulary Vectorization

4-Create DTM

Model1-Normal Nfold GLm

Repeating the same steps on Testing part

Predicting

Remove Stopwords

Prune Vocabulary

-Throws out very frequent and very infrequent terms

Vectorize

Creating DTM for Training and Testing

Method2-N-gram

Model

Testing

Method3-Feature hashing with N-gram

hash_vectorizer-creats a text vectorizer function which used in constructing dtm/cropus

hash_size means hash_buckets

Model

Testing

Basic transformations with TFIDF

Normalization

TFIDF

Model

Testing

Summary