There are mainly 4 methods, 1-Normal GLM. 2-Removing StopWords and Prune-Vocabulary with N-gram(1,2). 3-Feature Hashing(creates a text vectorizer function by using hash buckets) with N-gram(1,2). 4-Basic transform(Normalization with l1-Manhatten) with TFIDF
#https://cran.r-project.org/web/packages/text2vec/vignettes/text-vectorization.html
library(text2vec)
Data <- read.csv("~/NLP/final_data1_new.csv")
str(Data)
## 'data.frame': 184 obs. of 2 variables:
## $ Message: Factor w/ 183 levels " He opened a new restaurant in a new city every year. ",..: 56 97 113 169 105 23 92 95 148 144 ...
## $ label : int 1 0 0 1 0 0 1 0 0 0 ...
colnames(Data)<-c("text","type")
Data$text<-as.character(Data$text)
head(Data)
## text
## 1 I love my new restaurant!!!!
## 2 My daughter loves her new restaurant!
## 3 Opened a new restaurant on the ocassion of Xmas and people loved it.
## 4 we love to invite you to our new Restaurant
## 5 New Restaurant !!! Customer loving the ambience
## 6 Does anyone know the new Restaurant opened last week in Bristol?
## type
## 1 1
## 2 0
## 3 0
## 4 1
## 5 0
## 6 0
dim(Data)
## [1] 184 2
summary(Data)
## text type
## Length:184 Min. :0.0000
## Class :character 1st Qu.:0.0000
## Mode :character Median :0.0000
## Mean :0.2337
## 3rd Qu.:0.0000
## Max. :1.0000
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
set.seed(100)
inTrain1<-createDataPartition(Data$type,p=0.60,list=F)
train<-Data[inTrain1,]
test<-Data[-inTrain1,]
train<-cbind(train,id=rownames(train))
test<-cbind(test,id=rownames(test))
rownames(train)<-c(1:nrow(train))
rownames(test)<-c(1:nrow(test))
nrow(train)
## [1] 111
nrow(test)
## [1] 73
prep_fun = tolower
tok_fun = word_tokenizer
it_train = itoken(train$text,
preprocessor = prep_fun,
tokenizer = tok_fun,
ids = train$id,
progressbar = FALSE)
class(it_train)
## [1] "itoken" "iterator" "iter" "abstractiter"
## [5] "R6"
vocab = create_vocabulary(it_train)
vectorizer = vocab_vectorizer(vocab)
t1 = Sys.time()
dtm_train = create_dtm(it_train, vectorizer)
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.1736 secs
dim(dtm_train)
## [1] 111 566
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-5
NFOLDS = 4
t1 = Sys.time()
glmnet_classifier = cv.glmnet(x = dtm_train, y = train[['type']],
family = 'binomial',
# L1 penalty
alpha = 1,
# interested in the area under ROC curve
type.measure = "auc",
# 4-fold cross-validation
nfolds = NFOLDS,
# high value is less accurate, but has faster training
thresh = 1e-3,
# again lower number of iterations for faster training
maxit = 1e3)
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.1736 secs
plot(glmnet_classifier)
print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))
## [1] "max AUC = 0.8482"
it_test = test$text %>%
prep_fun %>%
tok_fun %>%
itoken(ids = test$id,
# turn off progressbar because it won't look nice in rmd
progressbar = FALSE)
dtm_test = create_dtm(it_test, vectorizer)
preds = predict(glmnet_classifier, dtm_test, type = 'response')[,1]
glmnet:::auc(test$type, preds)
## [1] 0.903753
test_result<-data.frame(cbind(test$type,ifelse(preds>0.5,1,0)))
confusionMatrix(test_result$X2,test_result$X1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 59 12
## 1 0 2
##
## Accuracy : 0.8356
## 95% CI : (0.7305, 0.9121)
## No Information Rate : 0.8082
## P-Value [Acc > NIR] : 0.337075
##
## Kappa : 0.2122
## Mcnemar's Test P-Value : 0.001496
##
## Sensitivity : 1.0000
## Specificity : 0.1429
## Pos Pred Value : 0.8310
## Neg Pred Value : 1.0000
## Prevalence : 0.8082
## Detection Rate : 0.8082
## Detection Prevalence : 0.9726
## Balanced Accuracy : 0.5714
##
## 'Positive' Class : 0
##
stop_words = c( "me", "myself", "ours", "ourselves", "you", "your", "yours","the")
t1 = Sys.time()
vocab = create_vocabulary(it_train, stopwords = stop_words)
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.0170002 secs
pruned_vocab = prune_vocabulary(vocab,
term_count_min = 10,
doc_proportion_max = 0.5,
doc_proportion_min = 0.001)
vectorizer = vocab_vectorizer(pruned_vocab)
# create dtm_train with new pruned vocabulary vectorizer
t1 = Sys.time()
dtm_train = create_dtm(it_train, vectorizer)
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.1570001 secs
dim(dtm_train)
## [1] 111 24
dtm_test = create_dtm(it_test, vectorizer)
dim(dtm_test)
## [1] 73 24
t1 = Sys.time()
vocab = create_vocabulary(it_train, ngram = c(1L, 2L))
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.01559997 secs
vocab = vocab %>% prune_vocabulary(term_count_min = 10,
doc_proportion_max = 0.5)
bigram_vectorizer = vocab_vectorizer(vocab)
dtm_train = create_dtm(it_train, bigram_vectorizer)
t1 = Sys.time()
glmnet_classifier = cv.glmnet(x = dtm_train, y = train[['type']],
family = 'binomial',
alpha = 1,
type.measure = "auc",
nfolds = NFOLDS,
thresh = 1e-3,
maxit = 1e3)
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.07799983 secs
plot(glmnet_classifier)
print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))
## [1] "max AUC = 0.9443"
# apply vectorizer
dtm_test = create_dtm(it_test, bigram_vectorizer)
preds = predict(glmnet_classifier, dtm_test, type = 'response')[,1]
test_result1<-data.frame(cbind(test$type,ifelse(preds>0.5,1,0)))
glmnet:::auc(test$type, preds)
## [1] 0.8753027
confusionMatrix(test_result1$X2,test_result1$X1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 58 6
## 1 1 8
##
## Accuracy : 0.9041
## 95% CI : (0.8124, 0.9606)
## No Information Rate : 0.8082
## P-Value [Acc > NIR] : 0.02046
##
## Kappa : 0.6419
## Mcnemar's Test P-Value : 0.13057
##
## Sensitivity : 0.9831
## Specificity : 0.5714
## Pos Pred Value : 0.9062
## Neg Pred Value : 0.8889
## Prevalence : 0.8082
## Detection Rate : 0.7945
## Detection Prevalence : 0.8767
## Balanced Accuracy : 0.7772
##
## 'Positive' Class : 0
##
h_vectorizer = hash_vectorizer(hash_size = 2 ^ 14, ngram = c(1L, 2L))
t1 = Sys.time()
dtm_train = create_dtm(it_train, h_vectorizer)
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.1570001 secs
t1 = Sys.time()
glmnet_classifier = cv.glmnet(x = dtm_train, y = train[['type']],
family = 'binomial',
alpha = 1,
type.measure = "auc",
nfolds = 5,
thresh = 1e-3,
maxit = 1e3)
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 1.9766 secs
plot(glmnet_classifier)
print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))
## [1] "max AUC = 0.8875"
dtm_test = create_dtm(it_test, h_vectorizer)
preds = predict(glmnet_classifier, dtm_test , type = 'response')[, 1]
test_result2<-data.frame(cbind(test$type,ifelse(preds>0.5,1,0)))
glmnet:::auc(test$type, preds)
## [1] 0.8807506
confusionMatrix(test_result2$X2,test_result2$X1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 58 6
## 1 1 8
##
## Accuracy : 0.9041
## 95% CI : (0.8124, 0.9606)
## No Information Rate : 0.8082
## P-Value [Acc > NIR] : 0.02046
##
## Kappa : 0.6419
## Mcnemar's Test P-Value : 0.13057
##
## Sensitivity : 0.9831
## Specificity : 0.5714
## Pos Pred Value : 0.9062
## Neg Pred Value : 0.8889
## Prevalence : 0.8082
## Detection Rate : 0.7945
## Detection Prevalence : 0.8767
## Balanced Accuracy : 0.7772
##
## 'Positive' Class : 0
##
dtm_train_l1_norm = normalize(dtm_train, "l1")
#l1 is the Manhattan Normalization
#difference btwn 2 vectors or matrix
#TF-IDF
vocab = create_vocabulary(it_train)
vectorizer = vocab_vectorizer(vocab)
dtm_train = create_dtm(it_train, vectorizer)
# define tfidf model
tfidf = TfIdf$new()
# fit model to train data and transform train data with fitted model
dtm_train_tfidf = fit_transform(dtm_train, tfidf)
# tfidf modified by fit_transform() call!
# apply pre-trained tf-idf transformation to test data
dtm_test_tfidf = create_dtm(it_test, vectorizer) %>%
transform(tfidf)
t1 = Sys.time()
glmnet_classifier = cv.glmnet(x = dtm_train_tfidf, y = train[['type']],
family = 'binomial',
alpha = 1,
type.measure = "auc",
nfolds = NFOLDS,
thresh = 1e-3,
maxit = 1e3)
print(difftime(Sys.time(), t1, units = 'sec'))
## Time difference of 0.1624 secs
plot(glmnet_classifier)
print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))
## [1] "max AUC = 0.9104"
preds = predict(glmnet_classifier, dtm_test_tfidf, type = 'response')[,1]
test_result3<-data.frame(cbind(test$type,ifelse(preds>0.5,1,0)))
glmnet:::auc(test$type, preds)
## [1] 0.8753027
confusionMatrix(test_result3$X2,test_result3$X1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 57 11
## 1 2 3
##
## Accuracy : 0.8219
## 95% CI : (0.7147, 0.9016)
## No Information Rate : 0.8082
## P-Value [Acc > NIR] : 0.4530
##
## Kappa : 0.239
## Mcnemar's Test P-Value : 0.0265
##
## Sensitivity : 0.9661
## Specificity : 0.2143
## Pos Pred Value : 0.8382
## Neg Pred Value : 0.6000
## Prevalence : 0.8082
## Detection Rate : 0.7808
## Detection Prevalence : 0.9315
## Balanced Accuracy : 0.5902
##
## 'Positive' Class : 0
##
Among all 4 methods, second methods give accuary better than others.