Import the file csv
# train data set
data1<-read.csv("D:/1 Teaching Material/R/case studies/Sentiment/Problem data set/train.csv", header = TRUE, sep = "~", stringsAsFactors = FALSE)
str(data1)
## 'data.frame': 30172 obs. of 5 variables:
## $ User_ID : int 11755 33912 10143 33114 17464 34367 14524 35130 1004 27086 ...
## $ Description : chr "After reading mixed reviews I almost didn't book at the W, but I was attending a concert at the Pantages Theatr"| __truncated__ "This motor inn is located about - city blocks from Fisherman's Warf and about - blocks from the Embarcadero. Yo"| __truncated__ "It was our first time there and surely not our last. Arrived very early off the train and went there to drop of"| __truncated__ "Great hotel in an excellent location, just off Times Square and next door to the Nederlander Theatre (home of t"| __truncated__ ...
## $ Browser_Used: chr "Google Chrome" "Firefox" "Google Chrome" "Mozilla" ...
## $ Device_Used : chr "Desktop" "Tablet" "Mobile" "Desktop" ...
## $ Is_Response : chr "Good" "Good" "Good" "Good" ...
#----------------------------------------
# test data set
tdata1<-read.csv("D:/1 Teaching Material/R/case studies/Sentiment/Problem data set/test.csv", header = TRUE, sep = "~", stringsAsFactors = FALSE)
str(tdata1)
## 'data.frame': 8760 obs. of 4 variables:
## $ User_ID : int 9602 8749 15500 5495 18570 35280 23513 15477 6505 34730 ...
## $ Description : chr "A friend and I stayed in this hotel when we were in Jacksonville for a concert. I always agonize over choosing "| __truncated__ "I enjoy staying here when I have early flights. Service is good, clean, easy in and out, imcludimg separate par"| __truncated__ "I stopped off in Seattle during a train tour of the west and had such a great experience. When I arrived in my "| __truncated__ "I have stayed at this hotel - or - times now for business travel and plan to again in a few weeks. Always very "| __truncated__ ...
## $ Browser_Used: chr "Edge" "Google Chrome" "Chrome" "Mozilla Firefox" ...
## $ Device_Used : chr "Desktop" "Mobile" "Mobile" "Desktop" ...
#str(data1)
#summary(data1)
# head(data1$Description)
# hidden line breaks to be removed from the data
data1$Description <- sapply(data1$Description,
function(x)
{ gsub("[\r\n]", "", x)
}
)
#str(data1$Description)
#-----------------------------#
tdata1$Description <- sapply(tdata1$Description,
function(x)
{ gsub("[\r\n]", "", x)
}
)
#str(tdata1$Description)
Sentiment analysis
library(sentimentr)
library(lexicon)
##
## Attaching package: 'lexicon'
## The following object is masked from 'package:sentimentr':
##
## available_data
library(syuzhet)
##
## Attaching package: 'syuzhet'
## The following object is masked from 'package:sentimentr':
##
## get_sentences
h1<-sentiment_by(data1$Description)
## Warning: Each time `sentiment_by` is run it has to do sentence boundary disambiguation when a
## raw `character` vector is passed to `text.var`. This may be costly of time and
## memory. It is highly recommended that the user first runs the raw `character`
## vector through the `get_sentences` function.
# str(h1)
#------------------
th1<-sentiment_by(tdata1$Description)
## Warning: Each time `sentiment_by` is run it has to do sentence boundary disambiguation when a
## raw `character` vector is passed to `text.var`. This may be costly of time and
## memory. It is highly recommended that the user first runs the raw `character`
## vector through the `get_sentences` function.
# str(th1)
h2<-get_sentiment(data1$Description, method="syuzhet")
#str(h2)
#summary(h2)
#---------------------------
th2<-get_sentiment(tdata1$Description, method="syuzhet")
#str(th2)
#summary(th2)
h3<-get_sentiment(data1$Description, method="bing")
#str(h1)
#-------------
th3<-get_sentiment(tdata1$Description, method="bing")
#str(th1)
h4<-get_sentiment(data1$Description, method="afinn")
#str(h1)
#--------------------
th4<-get_sentiment(tdata1$Description, method="afinn")
#str(th1)
h5<-get_sentiment(data1$Description, method="nrc", lang = "english")
#str(h5)
#-------------------------
th5<-get_sentiment(tdata1$Description, method="nrc", lang = "english")
#str(th5)
combining the sentiments of all the methods and taking average of it
avg_sent<-cbind(h1$ave_sentiment,h2, h3,h4,h5)
#-------------------------------
tavg_sent<-cbind(th1$ave_sentiment,th2, th3,th4,th5)
since all the different methods of sentients have different scale convert them to standard one, -1, 0, 1 using sign function
h11<-sign(h1$ave_sentiment)
h21<-sign(h2)
h31<-sign(h3)
h41<-sign(h4)
h51<-sign(h5)
#--------------------------
th11<-sign(th1$ave_sentiment)
th21<-sign(th2)
th31<-sign(th3)
th41<-sign(th4)
th51<-sign(th5)
combine the sentiments
avg_sent<-cbind(h11,h21, h31,h41,h51)
#str(avg_sent)
#dim(avg_sent)
#--------------------------
tavg_sent<-cbind(th11,th21, th31,th41,th51)
#str(tavg_sent)
#dim(tavg_sent)
calculate the average and sum of the sentiment of h1 to h5
avg<-rowMeans(avg_sent[,1:5])
#str(avg)
#---------------------
tavg<-rowMeans(tavg_sent[,1:5])
#str(tavg)
# convert the average sentiment into categorical variable as positive sentiment, negative sentiment and neutral
sent1<-sign(avg)
sent1<-as.factor(sent1)
#-----------------------
tsent1<-sign(tavg)
tsent1<-as.factor(tsent1)
# label the sentiment
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:rpart':
##
## solder
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:base':
##
## format.pval, units
sent1<-factor(sent1, levels = c(-1,0,1), labels = c("Negative", "Neutral", "Positive"))
#str(sent1)
#---------------------------
tsent1<-factor(tsent1, levels = c(-1,0,1), labels = c("Negative", "Neutral", "Positive"))
#str(tsent1)
final data set for analysis
data2<-cbind.data.frame(data1,sent1)
#str(data2)
#--------------------
tdata2<-cbind.data.frame(tdata1,tsent1)
#str(tdata2)
# remove data which is not required for model building
data3<-data2[,-2]
#str(data3)
#head(data3)
#----------------------------
tdata3<-tdata2[,-2]
#str(tdata3)
#head(tdata3)
#str(tdata3)
# convert the data into factor
data3$Browser_Used<-as.factor(data3$Browser_Used)
data3$Device_Used<-as.factor(data3$Device_Used)
data3$Is_Response<-as.factor(data3$Is_Response)
#-------------------------------------
tdata3$Browser_Used<-as.factor(tdata3$Browser_Used)
tdata3$Device_Used<-as.factor(tdata3$Device_Used)
Analysis of the data
# Descriptive statistics
#summary(data3)
#table(data3$Browser_Used)
# problem with categories, few categories had to be combined
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# recode the data for similar browser names
data3$Browser_Used<-recode(data3$Browser_Used, Chrome = "Google Chrome")
data3$Browser_Used<-recode(data3$Browser_Used, InternetExplorer = "Internet Explorer")
data3$Browser_Used<-recode(data3$Browser_Used, Mozilla = "Mozilla Firefox")
data3$Browser_Used<-recode(data3$Browser_Used, IE = "Internet Explorer")
table(data3$Browser_Used)
##
## Google Chrome Edge Firefox Internet Explorer
## 5478 5530 5754 7093
## Mozilla Firefox Opera Safari
## 5737 274 306
table(data3$Device_Used)
##
## Desktop Mobile Tablet
## 11630 11639 6903
table(data3$Is_Response)
##
## Bad Good
## 9605 20567
table(data3$sent1)
##
## Negative Neutral Positive
## 2436 353 27383
#-----------------
tdata3$Browser_Used<-recode(tdata3$Browser_Used, Chrome = "Google Chrome")
tdata3$Browser_Used<-recode(tdata3$Browser_Used, InternetExplorer = "Internet Explorer")
tdata3$Browser_Used<-recode(tdata3$Browser_Used, Mozilla = "Mozilla Firefox")
tdata3$Browser_Used<-recode(tdata3$Browser_Used, IE = "Internet Explorer")
table(tdata3$Browser_Used)
##
## Google Chrome Edge Firefox Internet Explorer
## 1651 1604 1613 2037
## Mozilla Firefox Opera Safari
## 1683 88 84
table(tdata3$Device_Used)
##
## Desktop Mobile Tablet
## 3396 3337 2027
table(tdata3$tsent1)
##
## Negative Neutral Positive
## 725 115 7920
Comparing the response variables with the other variables using table and plots
tab1<-table(data3$Is_Response, data3$Device_Used)
tab1
##
## Desktop Mobile Tablet
## Bad 3417 3418 2770
## Good 8213 8221 4133
chisq.test(tab1)
##
## Pearson's Chi-squared test
##
## data: tab1
## X-squared = 283.7, df = 2, p-value < 2.2e-16
ggplot(data3, aes(Device_Used, fill= Is_Response))+geom_bar()
ggplot(data3, aes(Device_Used, fill= Is_Response))+geom_bar(position = "dodge")
tab2<-table(data3$Is_Response, data3$Browser_Used)
tab2
##
## Google Chrome Edge Firefox Internet Explorer Mozilla Firefox Opera
## Bad 755 3030 2881 949 1815 92
## Good 4723 2500 2873 6144 3922 182
##
## Safari
## Bad 83
## Good 223
chisq.test(tab2)
##
## Pearson's Chi-squared test
##
## data: tab2
## X-squared = 4164.4, df = 6, p-value < 2.2e-16
ggplot(data3, aes(Browser_Used, fill= Is_Response))+geom_bar()
ggplot(data3, aes(Browser_Used, fill= Is_Response))+geom_bar(position = "dodge")
tab3<-table(data3$Is_Response, data3$sent1)
tab3
##
## Negative Neutral Positive
## Bad 2315 299 6991
## Good 121 54 20392
chisq.test(tab3)
##
## Pearson's Chi-squared test
##
## data: tab3
## X-squared = 5439.8, df = 2, p-value < 2.2e-16
ggplot(data3, aes(sent1, fill= Is_Response))+geom_bar()
ggplot(data3, aes(sent1, fill= Is_Response))+geom_bar(position = "dodge")
Applying Model
Decision tree
library(rpart)
library(rpart.plot)
library(caret)
# decision tree
trctrl <- trainControl(method = "cv", number = 10)
dtm1<-train(Is_Response~Browser_Used+Device_Used+sent1,
data = data3,
method = "rpart",
parms = list(split = "information"),
trControl=trctrl,
tuneLength = 10)
dtm1
## CART
##
## 30172 samples
## 3 predictor
## 2 classes: 'Bad', 'Good'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 27156, 27156, 27154, 27154, 27156, 27155, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.00000000 0.7680292 0.3890201
## 0.02821447 0.7624948 0.3250137
## 0.05642894 0.7624948 0.3250137
## 0.08464341 0.7624948 0.3250137
## 0.11285789 0.7624948 0.3250137
## 0.14107236 0.7624948 0.3250137
## 0.16928683 0.7624948 0.3250137
## 0.19750130 0.7624948 0.3250137
## 0.22571577 0.7624948 0.3250137
## 0.25393024 0.7195741 0.1539621
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
# using the train data set predict the model
#str(tdata3)
# since the variable sent1 is named tsent1 we have to change it
tdata3$sent1<-tdata3$tsent1
# remove the extra variable,
tdata3<-tdata3[,-4]
#str(tdata3)
dtm1_predict<-predict(dtm1, tdata3)
#dtm1_predict
# create a new data set with predicted values for test data set
tdatapredicted<-tdata3
tdatapredicted$responce<-dtm1_predict
tdatapredicted<-tdatapredicted[,-c(2:4)]
#str(tdatapredicted)
# store the output in a file
write.table(tdatapredicted, "D:/1 Teaching Material/R/case studies/Sentiment/solution/output_decision_tree.csv", row.names = TRUE, sep = "~")
library(caret)
d1<-dummyVars(~Browser_Used+Device_Used+sent1, data = data3)
#predict the values of dummy from the data
dummies <- predict(d1, newdata = data3)
# add dummy to the actual data
data4<-cbind(data3,dummies)
#str(data4)
# remove the browser, device and sent1 variable from the data set
data5<-data4[,-c(2,3,5)]
#str(data5)
#-----------------------------------------------
d2<-dummyVars(~Browser_Used+Device_Used+sent1, data = tdata3)
#predict the values of dummy from the data
dummies2 <- predict(d2, newdata = tdata3)
# add dummy to the actua data
tdata4<-cbind(tdata3,dummies2)
#str(tdata4)
# remove the browser, device and sent1
tdata5<-tdata4[,-c(2,3,4)]
#str(tdata5)
#str(data5)
Support Vector MAchine
library(caret)
trctrl <- trainControl(method = "cv", number = 5)
dtm2<- train(Is_Response~., data = data5, method = "svmLinear",
trControl=trctrl,
preProcess = c("center", "scale"),
tuneLength = 10)
dtm2
## Support Vector Machines with Linear Kernel
##
## 30172 samples
## 14 predictor
## 2 classes: 'Bad', 'Good'
##
## Pre-processing: centered (14), scaled (14)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 24138, 24137, 24138, 24137, 24138
## Resampling results:
##
## Accuracy Kappa
## 0.7624949 0.3250868
##
## Tuning parameter 'C' was held constant at a value of 1
# predict the values
dtm2_predict<-predict(dtm2,tdata5)
# create a new data set with predicted values for test data set
tdatapredicted2<-tdata3
tdatapredicted2$responce<-dtm2_predict
str(tdatapredicted2)
## 'data.frame': 8760 obs. of 5 variables:
## $ User_ID : int 9602 8749 15500 5495 18570 35280 23513 15477 6505 34730 ...
## $ Browser_Used: Factor w/ 7 levels "Google Chrome",..: 2 1 1 5 2 4 2 3 3 1 ...
## $ Device_Used : Factor w/ 3 levels "Desktop","Mobile",..: 1 2 2 1 2 1 3 3 2 3 ...
## $ sent1 : Factor w/ 3 levels "Negative","Neutral",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ responce : Factor w/ 2 levels "Bad","Good": 2 2 2 2 2 2 2 2 2 2 ...
tdatapredicted2<-tdatapredicted2[,-c(2:4)]
# save the output
write.table(tdatapredicted2, "file:///D:/1 Teaching Material/R/case studies/Sentiment/solution/output_SVM.csv", row.names = TRUE, sep = "~")
KNN
fitControl = trainControl(method="cv", number = 10)
# apply the knn on the whole data
knnMod2 = train(Is_Response~., data=data5,
method="knn",
trControl=fitControl,
preProcess=c("center","scale"),
tuneLength=10)
print(knnMod2)
## k-Nearest Neighbors
##
## 30172 samples
## 14 predictor
## 2 classes: 'Bad', 'Good'
##
## Pre-processing: centered (14), scaled (14)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 27155, 27155, 27155, 27155, 27154, 27155, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7373730 0.3551037
## 7 0.7431727 0.3621451
## 9 0.7466528 0.3645643
## 11 0.7505305 0.3716171
## 13 0.7517572 0.3724535
## 15 0.7548062 0.3778595
## 17 0.7547071 0.3756271
## 19 0.7560990 0.3775276
## 21 0.7577893 0.3802678
## 23 0.7560989 0.3732679
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 21.
plot(knnMod2)
# predict the values
knnMod2_predict<-predict(knnMod2,tdata5)
#create a new data set with predicted values for test data set
tdatapredicted3<-tdata3
tdatapredicted3$response<-knnMod2_predict
#str(tdatapredicted3)
tdatapredicted3<-tdatapredicted3[,-c(2:4)]
write.table(tdatapredicted3, "file:///D:/1 Teaching Material/R/case studies/Sentiment/solution/output_knn.csv", row.names = TRUE, sep = "~")