Import the file csv

# train data set
data1<-read.csv("D:/1 Teaching Material/R/case studies/Sentiment/Problem data set/train.csv", header = TRUE, sep = "~", stringsAsFactors = FALSE)
str(data1)

## 'data.frame':    30172 obs. of  5 variables:
##  $ User_ID     : int  11755 33912 10143 33114 17464 34367 14524 35130 1004 27086 ...
##  $ Description : chr  "After reading mixed reviews I almost didn't book at the W, but I was attending a concert at the Pantages Theatr"| __truncated__ "This motor inn is located about - city blocks from Fisherman's Warf and about - blocks from the Embarcadero. Yo"| __truncated__ "It was our first time there and surely not our last. Arrived very early off the train and went there to drop of"| __truncated__ "Great hotel in an excellent location, just off Times Square and next door to the Nederlander Theatre (home of t"| __truncated__ ...
##  $ Browser_Used: chr  "Google Chrome" "Firefox" "Google Chrome" "Mozilla" ...
##  $ Device_Used : chr  "Desktop" "Tablet" "Mobile" "Desktop" ...
##  $ Is_Response : chr  "Good" "Good" "Good" "Good" ...

#----------------------------------------
# test data set
tdata1<-read.csv("D:/1 Teaching Material/R/case studies/Sentiment/Problem data set/test.csv", header = TRUE, sep = "~", stringsAsFactors = FALSE)
str(tdata1)

## 'data.frame':    8760 obs. of  4 variables:
##  $ User_ID     : int  9602 8749 15500 5495 18570 35280 23513 15477 6505 34730 ...
##  $ Description : chr  "A friend and I stayed in this hotel when we were in Jacksonville for a concert. I always agonize over choosing "| __truncated__ "I enjoy staying here when I have early flights. Service is good, clean, easy in and out, imcludimg separate par"| __truncated__ "I stopped off in Seattle during a train tour of the west and had such a great experience. When I arrived in my "| __truncated__ "I have stayed at this hotel - or - times now for business travel and plan to again in a few weeks. Always very "| __truncated__ ...
##  $ Browser_Used: chr  "Edge" "Google Chrome" "Chrome" "Mozilla Firefox" ...
##  $ Device_Used : chr  "Desktop" "Mobile" "Mobile" "Desktop" ...

#str(data1)
#summary(data1)
# head(data1$Description)
# hidden line breaks to be removed from the data

data1$Description <- sapply(data1$Description,
                                    function(x) 
                                      { gsub("[\r\n]", "", x) 
                                      }
                            )
#str(data1$Description)
#-----------------------------#

tdata1$Description <- sapply(tdata1$Description,
                                    function(x) 
                                      { gsub("[\r\n]", "", x) 
                                      }
                            )
#str(tdata1$Description)

Sentiment analysis

library(sentimentr)
library(lexicon)

## 
## Attaching package: 'lexicon'

## The following object is masked from 'package:sentimentr':
## 
##     available_data

library(syuzhet)

## 
## Attaching package: 'syuzhet'

## The following object is masked from 'package:sentimentr':
## 
##     get_sentences

h1<-sentiment_by(data1$Description)

## Warning: Each time `sentiment_by` is run it has to do sentence boundary disambiguation when a
## raw `character` vector is passed to `text.var`. This may be costly of time and
## memory.  It is highly recommended that the user first runs the raw `character`
## vector through the `get_sentences` function.

# str(h1)
#------------------

th1<-sentiment_by(tdata1$Description)

## Warning: Each time `sentiment_by` is run it has to do sentence boundary disambiguation when a
## raw `character` vector is passed to `text.var`. This may be costly of time and
## memory.  It is highly recommended that the user first runs the raw `character`
## vector through the `get_sentences` function.

# str(th1)

h2<-get_sentiment(data1$Description, method="syuzhet")
#str(h2)
#summary(h2)
#---------------------------
th2<-get_sentiment(tdata1$Description, method="syuzhet")
#str(th2)
#summary(th2)

h3<-get_sentiment(data1$Description, method="bing")
#str(h1)
#-------------
th3<-get_sentiment(tdata1$Description, method="bing")
#str(th1)

h4<-get_sentiment(data1$Description, method="afinn")
#str(h1)
#--------------------
th4<-get_sentiment(tdata1$Description, method="afinn")
#str(th1)

h5<-get_sentiment(data1$Description, method="nrc", lang = "english")
#str(h5)
#-------------------------
th5<-get_sentiment(tdata1$Description, method="nrc", lang = "english")
#str(th5)

combining the sentiments of all the methods and taking average of it

avg_sent<-cbind(h1$ave_sentiment,h2, h3,h4,h5)

#-------------------------------
tavg_sent<-cbind(th1$ave_sentiment,th2, th3,th4,th5)

since all the different methods of sentients have different scale convert them to standard one, -1, 0, 1 using sign function

h11<-sign(h1$ave_sentiment)
h21<-sign(h2)
h31<-sign(h3)
h41<-sign(h4)
h51<-sign(h5)
#--------------------------
th11<-sign(th1$ave_sentiment)
th21<-sign(th2)
th31<-sign(th3)
th41<-sign(th4)
th51<-sign(th5)

combine the sentiments

avg_sent<-cbind(h11,h21, h31,h41,h51)
#str(avg_sent)
#dim(avg_sent)
#--------------------------

tavg_sent<-cbind(th11,th21, th31,th41,th51)
#str(tavg_sent)
#dim(tavg_sent)

calculate the average and sum of the sentiment of h1 to h5

avg<-rowMeans(avg_sent[,1:5])
#str(avg)
#---------------------
tavg<-rowMeans(tavg_sent[,1:5])
#str(tavg)

# convert the average sentiment into categorical variable as positive sentiment, negative sentiment and neutral

sent1<-sign(avg)
sent1<-as.factor(sent1)
#-----------------------
tsent1<-sign(tavg)
tsent1<-as.factor(tsent1)

# label the sentiment
library(Hmisc)

## Loading required package: survival

## 
## Attaching package: 'survival'

## The following object is masked from 'package:rpart':
## 
##     solder

## The following object is masked from 'package:caret':
## 
##     cluster

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:e1071':
## 
##     impute

## The following objects are masked from 'package:base':
## 
##     format.pval, units

sent1<-factor(sent1, levels = c(-1,0,1), labels = c("Negative", "Neutral", "Positive"))
#str(sent1)
#---------------------------
tsent1<-factor(tsent1, levels = c(-1,0,1), labels = c("Negative", "Neutral", "Positive"))
#str(tsent1)

final data set for analysis

data2<-cbind.data.frame(data1,sent1)
#str(data2)
#--------------------
tdata2<-cbind.data.frame(tdata1,tsent1)
#str(tdata2)

# remove data which is not required for model building
data3<-data2[,-2]
#str(data3)
#head(data3)
#----------------------------
tdata3<-tdata2[,-2]
#str(tdata3)
#head(tdata3)
#str(tdata3)

# convert the data into factor
data3$Browser_Used<-as.factor(data3$Browser_Used)
data3$Device_Used<-as.factor(data3$Device_Used)
data3$Is_Response<-as.factor(data3$Is_Response)
#-------------------------------------
tdata3$Browser_Used<-as.factor(tdata3$Browser_Used)
tdata3$Device_Used<-as.factor(tdata3$Device_Used)

Analysis of the data

# Descriptive statistics
#summary(data3)
#table(data3$Browser_Used)
# problem with categories, few categories had to be combined

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# recode the data for similar browser names
data3$Browser_Used<-recode(data3$Browser_Used, Chrome = "Google Chrome")
data3$Browser_Used<-recode(data3$Browser_Used, InternetExplorer = "Internet Explorer")
data3$Browser_Used<-recode(data3$Browser_Used, Mozilla = "Mozilla Firefox")
data3$Browser_Used<-recode(data3$Browser_Used, IE = "Internet Explorer")
table(data3$Browser_Used)

## 
##     Google Chrome              Edge           Firefox Internet Explorer 
##              5478              5530              5754              7093 
##   Mozilla Firefox             Opera            Safari 
##              5737               274               306

table(data3$Device_Used)

## 
## Desktop  Mobile  Tablet 
##   11630   11639    6903

table(data3$Is_Response)

## 
##   Bad  Good 
##  9605 20567

table(data3$sent1)

## 
## Negative  Neutral Positive 
##     2436      353    27383

#-----------------

tdata3$Browser_Used<-recode(tdata3$Browser_Used, Chrome = "Google Chrome")
tdata3$Browser_Used<-recode(tdata3$Browser_Used, InternetExplorer = "Internet Explorer")
tdata3$Browser_Used<-recode(tdata3$Browser_Used, Mozilla = "Mozilla Firefox")
tdata3$Browser_Used<-recode(tdata3$Browser_Used, IE = "Internet Explorer")

table(tdata3$Browser_Used)

## 
##     Google Chrome              Edge           Firefox Internet Explorer 
##              1651              1604              1613              2037 
##   Mozilla Firefox             Opera            Safari 
##              1683                88                84

table(tdata3$Device_Used)

## 
## Desktop  Mobile  Tablet 
##    3396    3337    2027

table(tdata3$tsent1)

## 
## Negative  Neutral Positive 
##      725      115     7920

Comparing the response variables with the other variables using table and plots

tab1<-table(data3$Is_Response, data3$Device_Used)
tab1

##       
##        Desktop Mobile Tablet
##   Bad     3417   3418   2770
##   Good    8213   8221   4133

chisq.test(tab1)

## 
##  Pearson's Chi-squared test
## 
## data:  tab1
## X-squared = 283.7, df = 2, p-value < 2.2e-16

ggplot(data3, aes(Device_Used, fill= Is_Response))+geom_bar()

ggplot(data3, aes(Device_Used, fill= Is_Response))+geom_bar(position = "dodge")

tab2<-table(data3$Is_Response, data3$Browser_Used)
tab2

##       
##        Google Chrome Edge Firefox Internet Explorer Mozilla Firefox Opera
##   Bad            755 3030    2881               949            1815    92
##   Good          4723 2500    2873              6144            3922   182
##       
##        Safari
##   Bad      83
##   Good    223

chisq.test(tab2)

## 
##  Pearson's Chi-squared test
## 
## data:  tab2
## X-squared = 4164.4, df = 6, p-value < 2.2e-16

ggplot(data3, aes(Browser_Used, fill= Is_Response))+geom_bar()

ggplot(data3, aes(Browser_Used, fill= Is_Response))+geom_bar(position = "dodge")

tab3<-table(data3$Is_Response, data3$sent1)
tab3

##       
##        Negative Neutral Positive
##   Bad      2315     299     6991
##   Good      121      54    20392

chisq.test(tab3)

## 
##  Pearson's Chi-squared test
## 
## data:  tab3
## X-squared = 5439.8, df = 2, p-value < 2.2e-16

ggplot(data3, aes(sent1, fill= Is_Response))+geom_bar()

ggplot(data3, aes(sent1, fill= Is_Response))+geom_bar(position = "dodge")

Applying Model

Decision tree

library(rpart)
library(rpart.plot)
library(caret)

# decision tree

trctrl <- trainControl(method = "cv", number = 10)

dtm1<-train(Is_Response~Browser_Used+Device_Used+sent1, 
                   data = data3, 
                   method = "rpart",
                   parms = list(split = "information"),
                   trControl=trctrl,
                   tuneLength = 10)
dtm1

## CART 
## 
## 30172 samples
##     3 predictor
##     2 classes: 'Bad', 'Good' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 27156, 27156, 27154, 27154, 27156, 27155, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.00000000  0.7680292  0.3890201
##   0.02821447  0.7624948  0.3250137
##   0.05642894  0.7624948  0.3250137
##   0.08464341  0.7624948  0.3250137
##   0.11285789  0.7624948  0.3250137
##   0.14107236  0.7624948  0.3250137
##   0.16928683  0.7624948  0.3250137
##   0.19750130  0.7624948  0.3250137
##   0.22571577  0.7624948  0.3250137
##   0.25393024  0.7195741  0.1539621
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.

# using the train data set predict the model
#str(tdata3)
# since the variable sent1 is named tsent1 we have to change it 
tdata3$sent1<-tdata3$tsent1
# remove the extra variable,
tdata3<-tdata3[,-4]
#str(tdata3)

dtm1_predict<-predict(dtm1, tdata3)
#dtm1_predict

# create a new data set with predicted values for test data set
tdatapredicted<-tdata3
tdatapredicted$responce<-dtm1_predict
tdatapredicted<-tdatapredicted[,-c(2:4)]
#str(tdatapredicted)

# store the output in a file
write.table(tdatapredicted, "D:/1 Teaching Material/R/case studies/Sentiment/solution/output_decision_tree.csv", row.names = TRUE, sep = "~")

convert categorical data into dummy

library(caret)

d1<-dummyVars(~Browser_Used+Device_Used+sent1, data = data3)
#predict the values of dummy from the data
dummies <- predict(d1, newdata = data3)

# add dummy to the actual data
data4<-cbind(data3,dummies)
#str(data4)
# remove the browser, device and sent1 variable from the data set
data5<-data4[,-c(2,3,5)]
#str(data5)
#-----------------------------------------------
d2<-dummyVars(~Browser_Used+Device_Used+sent1, data = tdata3)
#predict the values of dummy from the data
dummies2 <- predict(d2, newdata = tdata3)

# add dummy to the actua data
tdata4<-cbind(tdata3,dummies2)
#str(tdata4)
# remove the browser, device and sent1
tdata5<-tdata4[,-c(2,3,4)]
#str(tdata5)
#str(data5)

Support Vector MAchine

library(caret)

trctrl <- trainControl(method = "cv", number = 5)

dtm2<- train(Is_Response~., data = data5, method = "svmLinear",
                 trControl=trctrl,
                 preProcess = c("center", "scale"),
                 tuneLength = 10)

dtm2

## Support Vector Machines with Linear Kernel 
## 
## 30172 samples
##    14 predictor
##     2 classes: 'Bad', 'Good' 
## 
## Pre-processing: centered (14), scaled (14) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 24138, 24137, 24138, 24137, 24138 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7624949  0.3250868
## 
## Tuning parameter 'C' was held constant at a value of 1

# predict the values
dtm2_predict<-predict(dtm2,tdata5)

# create a new data set with predicted values for test data set
tdatapredicted2<-tdata3
tdatapredicted2$responce<-dtm2_predict
str(tdatapredicted2)

## 'data.frame':    8760 obs. of  5 variables:
##  $ User_ID     : int  9602 8749 15500 5495 18570 35280 23513 15477 6505 34730 ...
##  $ Browser_Used: Factor w/ 7 levels "Google Chrome",..: 2 1 1 5 2 4 2 3 3 1 ...
##  $ Device_Used : Factor w/ 3 levels "Desktop","Mobile",..: 1 2 2 1 2 1 3 3 2 3 ...
##  $ sent1       : Factor w/ 3 levels "Negative","Neutral",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ responce    : Factor w/ 2 levels "Bad","Good": 2 2 2 2 2 2 2 2 2 2 ...

tdatapredicted2<-tdatapredicted2[,-c(2:4)]

# save the output
write.table(tdatapredicted2, "file:///D:/1 Teaching Material/R/case studies/Sentiment/solution/output_SVM.csv", row.names = TRUE, sep = "~")

KNN

fitControl = trainControl(method="cv", number = 10)
# apply the knn on the whole data

knnMod2 = train(Is_Response~., data=data5,
                method="knn",
                trControl=fitControl,
                preProcess=c("center","scale"),
                tuneLength=10)

print(knnMod2)

## k-Nearest Neighbors 
## 
## 30172 samples
##    14 predictor
##     2 classes: 'Bad', 'Good' 
## 
## Pre-processing: centered (14), scaled (14) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 27155, 27155, 27155, 27155, 27154, 27155, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.7373730  0.3551037
##    7  0.7431727  0.3621451
##    9  0.7466528  0.3645643
##   11  0.7505305  0.3716171
##   13  0.7517572  0.3724535
##   15  0.7548062  0.3778595
##   17  0.7547071  0.3756271
##   19  0.7560990  0.3775276
##   21  0.7577893  0.3802678
##   23  0.7560989  0.3732679
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 21.

plot(knnMod2)

# predict the values
knnMod2_predict<-predict(knnMod2,tdata5)

#create a new data set with predicted values for test data set
tdatapredicted3<-tdata3
tdatapredicted3$response<-knnMod2_predict
#str(tdatapredicted3)
tdatapredicted3<-tdatapredicted3[,-c(2:4)]

write.table(tdatapredicted3, "file:///D:/1 Teaching Material/R/case studies/Sentiment/solution/output_knn.csv", row.names = TRUE, sep = "~")

Cases Study - Sentiment analysis

Dr Manohar Kapse

9 March 2019

convert categorical data into dummy