### Section 1: Introduction and Data Preprocessing
The task is to classify Yelp reviews as positive ("p") or negative ("n") sentiment based on the review text. The general preprocessing steps are: convert the sentiment labels to a factor, build a corpus from the reviews, normalize the text (lowercase, remove numbers, punctuation, stopwords, and extra whitespace), construct a document-term matrix, and split the data into training and test sets.
library(tm)
## Loading required package: NLP
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Could not load tcltk. Will use slower R code instead.
## Loading required package: RSQLite
library(nnet)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(tinytex)
setwd("~/Desktop")
reviewsyelp<-read.csv("HW4_yelp_sentiment.csv", stringsAsFactors = FALSE)
str(reviewsyelp)
## 'data.frame': 988 obs. of 2 variables:
## $ reviews : chr "Wow... Loved this place." "Crust is not good." "Not tasty and the texture was just nasty." "Stopped by during the late May bank holiday off Rick Steve recommendation and loved it." ...
## $ sentiment: chr "p" "n" "n" "p" ...
summary(reviewsyelp)
## reviews sentiment
## Length:988 Length:988
## Class :character Class :character
## Mode :character Mode :character
# Data pre-processing
# Convert sentiment into a factor with levels "p" (positive) and "n" (negative)
reviewsyelp$sentiment = factor(reviewsyelp$sentiment, levels=c("p","n"))
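# As a quick sanity check (not in the original script), the class balance can be
# inspected before modeling; the near-0.5 prevalence in the later confusion
# matrices suggests the two classes are roughly balanced.
table(reviewsyelp$sentiment)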
### Tokenization
# Convert the review text into a corpus
reviewsyelp_corpus = Corpus(VectorSource(reviewsyelp$reviews))
# Next, normalize the review text with a series of pre-processing steps:
# 1. convert to lower case, 2. remove numbers, 3. remove punctuation and
# stopwords, 4. strip extra whitespace.
# (The "transformation drops documents" warnings below come from tm_map on a
# SimpleCorpus and are known to be spurious; no documents are actually dropped.)
reviewsyelp_corpus = tm_map(reviewsyelp_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(reviewsyelp_corpus,
## content_transformer(tolower)): transformation drops documents
reviewsyelp_corpus = tm_map(reviewsyelp_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(reviewsyelp_corpus, removeNumbers):
## transformation drops documents
reviewsyelp_corpus = tm_map(reviewsyelp_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(reviewsyelp_corpus, removePunctuation):
## transformation drops documents
reviewsyelp_corpus = tm_map(reviewsyelp_corpus, removeWords, c("the", "and", stopwords("english")))
## Warning in tm_map.SimpleCorpus(reviewsyelp_corpus, removeWords, c("the", :
## transformation drops documents
reviewsyelp_corpus = tm_map(reviewsyelp_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(reviewsyelp_corpus, stripWhitespace):
## transformation drops documents
reviewsyelp_dtm <- DocumentTermMatrix(reviewsyelp_corpus)
reviewsyelp_dtm = removeSparseTerms(reviewsyelp_dtm, 0.99)
reviewsyelp_dtm_matrix<-as.matrix(reviewsyelp_dtm)
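# The wordcloud and RColorBrewer packages are loaded above but not otherwise
# used; a minimal, purely illustrative sketch of how the frequent terms in the
# document-term matrix could be visualized (min.freq = 5 is an arbitrary choice):
wordcloud(words = colnames(reviewsyelp_dtm_matrix),
          freq = colSums(reviewsyelp_dtm_matrix),
          min.freq = 5, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))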
# Combine sentiment with the document-term matrix; cbind() coerces the
# sentiment factor to its integer codes (p = 1, n = 2)
reviewsyelp_dtm_matrix = cbind(reviewsyelp$sentiment, reviewsyelp_dtm_matrix)
reviewsyelp_df<-as.data.frame(reviewsyelp_dtm_matrix, stringsAsFactors = FALSE)
# Recode the label column V1 so that positive = 0 and negative = 1
reviewsyelp_df$V1[reviewsyelp_df$V1==1] <- 0
reviewsyelp_df$V1[reviewsyelp_df$V1==2] <- 1
# Export to csv (optional)
#write.csv(reviewsyelp_df, "reviewsyelp_DTM.csv")
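# A minimal alternative sketch (not in the original script) that attaches the
# label explicitly instead of relying on the factor-to-integer coercion above;
# reviewsyelp_df_alt is a hypothetical name used only for this illustration
# (V1 ends up as the last column rather than the first).
reviewsyelp_df_alt <- as.data.frame(as.matrix(reviewsyelp_dtm), stringsAsFactors = FALSE)
reviewsyelp_df_alt$V1 <- ifelse(reviewsyelp$sentiment == "n", 1, 0)  # positive = 0, negative = 1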
#### Train and Test Dataset
# Split the data into training and testing sets
reviewsyelp_df$V1<-as.factor(reviewsyelp_df$V1)
#Setting seed
set.seed(100)
#Splitting data into 70:30 train and test
train_index <- createDataPartition(reviewsyelp_df$V1,p=0.7,list=FALSE)
trainData <- reviewsyelp_df[train_index,]
testData <- reviewsyelp_df[-train_index,]
rownames(testData)<-NULL
rownames(trainData)<-NULL
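# createDataPartition() samples within each level of V1, so as a quick sketch
# (not in the original script) the class proportions of the split can be verified:
prop.table(table(trainData$V1))
prop.table(table(testData$V1))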
### Section 2: Logistic Regression
# Fit a logistic regression model on the training data
reviewsyelp.glm = glm(V1~ ., family = "binomial", data = trainData, maxit = 100)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Predict on the test data. The warning above ("fitted probabilities numerically
# 0 or 1 occurred") typically indicates (quasi-)separation: some terms appear
# only in positive or only in negative training reviews.
pred.glm = as.numeric(predict(reviewsyelp.glm, testData, type="response") > 0.5)
pred.glm = as.factor(pred.glm)
#Evaluating the model
confusionMatrix(pred.glm, testData$V1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 91 28
## 1 57 120
##
## Accuracy : 0.7128
## 95% CI : (0.6576, 0.7637)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 7.979e-14
##
## Kappa : 0.4257
##
## Mcnemar's Test P-Value : 0.002389
##
## Sensitivity : 0.6149
## Specificity : 0.8108
## Pos Pred Value : 0.7647
## Neg Pred Value : 0.6780
## Prevalence : 0.5000
## Detection Rate : 0.3074
## Detection Prevalence : 0.4020
## Balanced Accuracy : 0.7128
##
## 'Positive' Class : 0
##
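# For consistency with the later sections, a minimal sketch (not in the original
# script) computing the misclassification error and accuracy of the logistic
# regression predictions; both values can also be read off the confusion matrix above.
mean(as.character(testData$V1) != as.character(pred.glm))
mean(as.character(testData$V1) == as.character(pred.glm))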
### Section 3: Artificial Neural Network and Deep Learning Experiment
# Fit a multinomial log-linear model (nnet::multinom) on the training data to
# predict V1 from all term variables
test <- multinom(V1~., data = trainData)
## # weights: 99 (98 variable)
## initial value 479.657849
## iter 10 value 272.539852
## iter 20 value 263.969986
## iter 30 value 263.415129
## iter 40 value 263.390320
## iter 50 value 263.388849
## final value 263.388832
## converged
#Predicting probability of Class for testing data
predicted=predict(test,testData,type="probs")
# Predicting class for the testing data. Note: with only two classes, multinom()
# returns a single vector of probabilities (for the second level), so max.col()
# sees a one-column matrix and always picks column 1; this is why every test
# review is predicted as class 0 below and the accuracy is 50%.
predictedCat <- levels(testData$V1)[max.col(predicted)]
#Confusion Matrix
table(testData$V1, predictedCat)
## predictedCat
## 0
## 0 148
## 1 148
#Misclassification error
mean(as.character(testData$V1) != as.character(predictedCat))
## [1] 0.5
#### Accuracy
#Accuracy
mean(as.character(testData$V1) == as.character(predictedCat))
## [1] 0.5
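# A minimal corrected sketch (not in the original script): for a two-class
# problem, predict(type = "class") returns the predicted labels directly, and a
# confusion matrix can be built from them; the result will differ from the
# degenerate 50% accuracy reported above. predictedCat2 is an illustrative name.
predictedCat2 <- predict(test, testData, type = "class")
table(testData$V1, predictedCat2)
mean(as.character(testData$V1) == as.character(predictedCat2))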
### Section 4: Random Forest
#### Random Forest Classification
# Train a random forest classifier. Note that x = trainData still contains the
# label column V1, so the label itself is available to the trees as a predictor;
# this leaks the target and inflates the accuracy reported below.
model_rf<-randomForest(x=trainData, y=trainData$V1)
model_rf
##
## Call:
## randomForest(x = trainData, y = trainData$V1)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 9
##
## OOB estimate of error rate: 0.14%
## Confusion matrix:
## 0 1 class.error
## 0 346 0 0.000000000
## 1 1 345 0.002890173
#Predict using Random forest Classifier
predict_rf <- predict(model_rf, newdata = testData)
confusionMatrix(predict_rf, testData$V1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 148 0
## 1 0 148
##
## Accuracy : 1
## 95% CI : (0.9876, 1)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0
## Specificity : 1.0
## Pos Pred Value : 1.0
## Neg Pred Value : 1.0
## Prevalence : 0.5
## Detection Rate : 0.5
## Detection Prevalence : 0.5
## Balanced Accuracy : 1.0
##
## 'Positive' Class : 0
##
# The reported accuracy rate is 1 (see the label-leakage note above)
#Misclassification error
mean(as.character(testData$V1) != as.character(predict_rf))
## [1] 0
#### Accuracy
#Accuracy
mean(as.character(testData$V1) == as.character(predict_rf))
## [1] 1
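# A minimal corrected sketch (not in the original script): refit the random
# forest with the label column V1 excluded from the predictors (V1 is the first
# column of the data frame). The resulting accuracy is a more honest estimate
# than the perfect score reported above; model_rf2 and predict_rf2 are
# illustrative names.
model_rf2 <- randomForest(x = trainData[, -1], y = trainData$V1)
predict_rf2 <- predict(model_rf2, newdata = testData[, -1])
confusionMatrix(predict_rf2, testData$V1)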
### Section 5: Algorithm Performance Comparison
On this dataset, logistic regression achieves 71.28% test accuracy, the nnet-based model reports 50%, and the random forest reports 100%. Logistic regression is a linear classifier, so a non-linear model such as a random forest would be expected to capture more of the structure in the term counts. Two caveats apply, however: the 50% accuracy of the multinom model is an artifact of the degenerate max.col() prediction step noted in Section 3, and the random forest's perfect accuracy is inflated because the label column V1 was included among its predictors.
Overall, random forest appears to be the most suitable model for this dataset, but its performance should be re-checked with the corrected fit sketched at the end of Section 4.