### Section 1: Introduction and Data Preprocessing
The task is to classify Yelp reviews as positive ("p") or negative ("n") sentiment based on the review text. The general preprocessing steps are: convert the sentiment labels to a factor, build a corpus from the reviews, normalize the text (lowercase, remove numbers, punctuation, stopwords, and extra whitespace), construct a document-term matrix, and split the data into training and test sets.
library(tm)
## Loading required package: NLP
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Could not load tcltk. Will use slower R code instead.
## Loading required package: RSQLite
library(nnet)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(tinytex)
setwd("~/Desktop")
reviewsyelp<-read.csv("HW4_yelp_sentiment.csv", stringsAsFactors = FALSE)
str(reviewsyelp)
## 'data.frame': 988 obs. of 2 variables:
## $ reviews : chr "Wow... Loved this place." "Crust is not good." "Not tasty and the texture was just nasty." "Stopped by during the late May bank holiday off Rick Steve recommendation and loved it." ...
## $ sentiment: chr "p" "n" "n" "p" ...
summary(reviewsyelp)
## reviews sentiment
## Length:988 Length:988
## Class :character Class :character
## Mode :character Mode :character
# Data pre-processing
# Convert sentiment into a factor with levels "p" (positive) and "n" (negative)
reviewsyelp$sentiment = factor(reviewsyelp$sentiment, levels=c("p","n"))
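# As a quick sanity check (not in the original script), the class balance can be
# inspected before modeling; the near-0.5 prevalence in the later confusion
# matrices suggests the two classes are roughly balanced.
table(reviewsyelp$sentiment)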
### Tokenization
# Convert the review text into a corpus
reviewsyelp_corpus = Corpus(VectorSource(reviewsyelp$reviews))
# Next, normalize the review text with a series of pre-processing steps:
# 1. convert to lower case, 2. remove numbers, 3. remove punctuation and
# stopwords, 4. strip extra whitespace.
# (The "transformation drops documents" warnings below come from tm_map on a
# SimpleCorpus and are known to be spurious; no documents are actually dropped.)
reviewsyelp_corpus = tm_map(reviewsyelp_corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(reviewsyelp_corpus,
## content_transformer(tolower)): transformation drops documents
reviewsyelp_corpus = tm_map(reviewsyelp_corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(reviewsyelp_corpus, removeNumbers):
## transformation drops documents
reviewsyelp_corpus = tm_map(reviewsyelp_corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(reviewsyelp_corpus, removePunctuation):
## transformation drops documents
reviewsyelp_corpus = tm_map(reviewsyelp_corpus, removeWords, c("the", "and", stopwords("english")))
## Warning in tm_map.SimpleCorpus(reviewsyelp_corpus, removeWords, c("the", :
## transformation drops documents
reviewsyelp_corpus = tm_map(reviewsyelp_corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(reviewsyelp_corpus, stripWhitespace):
## transformation drops documents
reviewsyelp_dtm <- DocumentTermMatrix(reviewsyelp_corpus)
reviewsyelp_dtm = removeSparseTerms(reviewsyelp_dtm, 0.99)
reviewsyelp_dtm_matrix<-as.matrix(reviewsyelp_dtm)
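# The wordcloud and RColorBrewer packages are loaded above but not otherwise
# used; a minimal, purely illustrative sketch of how the frequent terms in the
# document-term matrix could be visualized (min.freq = 5 is an arbitrary choice):
wordcloud(words = colnames(reviewsyelp_dtm_matrix),
          freq = colSums(reviewsyelp_dtm_matrix),
          min.freq = 5, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))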
# Combine sentiment with the document-term matrix; cbind() coerces the
# sentiment factor to its integer codes (p = 1, n = 2)
reviewsyelp_dtm_matrix = cbind(reviewsyelp$sentiment, reviewsyelp_dtm_matrix)
reviewsyelp_df<-as.data.frame(reviewsyelp_dtm_matrix, stringsAsFactors = FALSE)
# Recode the label column V1 so that positive = 0 and negative = 1
reviewsyelp_df$V1[reviewsyelp_df$V1==1] <- 0
reviewsyelp_df$V1[reviewsyelp_df$V1==2] <- 1
# Export to csv (optional)
#write.csv(reviewsyelp_df, "reviewsyelp_DTM.csv")
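# A minimal alternative sketch (not in the original script) that attaches the
# label explicitly instead of relying on the factor-to-integer coercion above;
# reviewsyelp_df_alt is a hypothetical name used only for this illustration
# (V1 ends up as the last column rather than the first).
reviewsyelp_df_alt <- as.data.frame(as.matrix(reviewsyelp_dtm), stringsAsFactors = FALSE)
reviewsyelp_df_alt$V1 <- ifelse(reviewsyelp$sentiment == "n", 1, 0)  # positive = 0, negative = 1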
#### Train and Test Dataset
# Split the data into training and testing sets
reviewsyelp_df$V1<-as.factor(reviewsyelp_df$V1)
#Setting seed
set.seed(100)
#Splitting data into 70:30 train and test
train_index <- createDataPartition(reviewsyelp_df$V1,p=0.7,list=FALSE)
trainData <- reviewsyelp_df[train_index,]
testData <- reviewsyelp_df[-train_index,]
rownames(testData)<-NULL
rownames(trainData)<-NULL
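# createDataPartition() samples within each level of V1, so as a quick sketch
# (not in the original script) the class proportions of the split can be verified:
prop.table(table(trainData$V1))
prop.table(table(testData$V1))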
### Section 2: Logistic Regression
# Fit a logistic regression model on the training data
reviewsyelp.glm = glm(V1~ ., family = "binomial", data = trainData, maxit = 100)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Predict on the test data. The warning above ("fitted probabilities numerically
# 0 or 1 occurred") typically indicates (quasi-)separation: some terms appear
# only in positive or only in negative training reviews.
pred.glm = as.numeric(predict(reviewsyelp.glm, testData, type="response") > 0.5)
pred.glm = as.factor(pred.glm)
#Evaluating the model
confusionMatrix(pred.glm, testData$V1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 91 28
## 1 57 120
##
## Accuracy : 0.7128
## 95% CI : (0.6576, 0.7637)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 7.979e-14
##
## Kappa : 0.4257
##
## Mcnemar's Test P-Value : 0.002389
##
## Sensitivity : 0.6149
## Specificity : 0.8108
## Pos Pred Value : 0.7647
## Neg Pred Value : 0.6780
## Prevalence : 0.5000
## Detection Rate : 0.3074
## Detection Prevalence : 0.4020
## Balanced Accuracy : 0.7128
##
## 'Positive' Class : 0
##
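# For consistency with the later sections, a minimal sketch (not in the original
# script) computing the misclassification error and accuracy of the logistic
# regression predictions; both values can also be read off the confusion matrix above.
mean(as.character(testData$V1) != as.character(pred.glm))
mean(as.character(testData$V1) == as.character(pred.glm))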
### Section 3: Artificial Neural Network and Deep Learning Experiment
# Fit a multinomial log-linear model (nnet::multinom) on the training data to
# predict V1 from all term variables
test <- multinom(V1~., data = trainData)
## # weights: 99 (98 variable)
## initial value 479.657849
## iter 10 value 272.539852
## iter 20 value 263.969986
## iter 30 value 263.415129
## iter 40 value 263.390320
## iter 50 value 263.388849
## final value 263.388832
## converged
#Predicting probability of Class for testing data
predicted=predict(test,testData,type="probs")
# Predicting class for the testing data. Note: with only two classes, multinom()
# returns a single vector of probabilities (for the second level), so max.col()
# sees a one-column matrix and always picks column 1; this is why every test
# review is predicted as class 0 below and the accuracy is 50%.
predictedCat <- levels(testData$V1)[max.col(predicted)]
#Confusion Matrix
table(testData$V1, predictedCat)
## predictedCat
## 0
## 0 148
## 1 148
#Misclassification error
mean(as.character(testData$V1) != as.character(predictedCat))
## [1] 0.5
#### Accuracy
#Accuracy
mean(as.character(testData$V1) == as.character(predictedCat))
## [1] 0.5
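# A minimal corrected sketch (not in the original script): for a two-class
# problem, predict(type = "class") returns the predicted labels directly, and a
# confusion matrix can be built from them; the result will differ from the
# degenerate 50% accuracy reported above. predictedCat2 is an illustrative name.
predictedCat2 <- predict(test, testData, type = "class")
table(testData$V1, predictedCat2)
mean(as.character(testData$V1) == as.character(predictedCat2))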
### Section 4: Random Forest
#### Random Forest Classification
# Train a random forest classifier. Note that x = trainData still contains the
# label column V1, so the label itself is available to the trees as a predictor;
# this leaks the target and inflates the accuracy reported below.
model_rf<-randomForest(x=trainData, y=trainData$V1)
model_rf
##
## Call:
## randomForest(x = trainData, y = trainData$V1)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 9
##
## OOB estimate of error rate: 0.14%
## Confusion matrix:
## 0 1 class.error
## 0 346 0 0.000000000
## 1 1 345 0.002890173
#Predict using Random forest Classifier
predict_rf <- predict(model_rf, newdata = testData)
confusionMatrix(predict_rf, testData$V1)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 148 0
## 1 0 148
##
## Accuracy : 1
## 95% CI : (0.9876, 1)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0
## Specificity : 1.0
## Pos Pred Value : 1.0
## Neg Pred Value : 1.0
## Prevalence : 0.5
## Detection Rate : 0.5
## Detection Prevalence : 0.5
## Balanced Accuracy : 1.0
##
## 'Positive' Class : 0
##
# The reported accuracy rate is 1 (see the label-leakage note above)
#Misclassification error
mean(as.character(testData$V1) != as.character(predict_rf))
## [1] 0
#### Accuracy
#Accuracy
mean(as.character(testData$V1) == as.character(predict_rf))
## [1] 1
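# A minimal corrected sketch (not in the original script): refit the random
# forest with the label column V1 excluded from the predictors (V1 is the first
# column of the data frame). The resulting accuracy is a more honest estimate
# than the perfect score reported above; model_rf2 and predict_rf2 are
# illustrative names.
model_rf2 <- randomForest(x = trainData[, -1], y = trainData$V1)
predict_rf2 <- predict(model_rf2, newdata = testData[, -1])
confusionMatrix(predict_rf2, testData$V1)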
### Section 5: Algorithm Performance Comparison
On this dataset, logistic regression achieves 71.28% test accuracy, the nnet-based model reports 50%, and the random forest reports 100%. Logistic regression is a linear classifier, so a non-linear model such as a random forest would be expected to capture more of the structure in the term counts. Two caveats apply, however: the 50% accuracy of the multinom model is an artifact of the degenerate max.col() prediction step noted in Section 3, and the random forest's perfect accuracy is inflated because the label column V1 was included among its predictors.
Overall, random forest appears to be the most suitable model for this dataset, but its performance should be re-checked with the corrected fit sketched at the end of Section 4.