Text Mining for extracting sentiment from a Health Website

Load the data

# Read the website data from the Excel workbook. The "minimal" name-repair
# setting asks readxl to leave the column names as they are in the sheet.
library(readxl)
data <- read_excel(
  path = "websitedata.xlsx",
  .name_repair = "minimal"
)

Univariate Analysis

# Separation based on gender of doctors

# Treat Gender as a categorical variable so it gets one bar per level.
data$Gender <- factor(data$Gender)

library(ggplot2)
# Bar chart of the number of rows per gender.
# The fill colour is mapped through the Gender aesthetic and a manual scale
# instead of a positional vector passed to geom_bar(): a constant fill vector
# must match the number of bars exactly, so it errors as soon as an extra bar
# (for example a missing Gender) shows up. The legend is suppressed because it
# would only repeat the x axis.
ggplot(data = data, mapping = aes(x = Gender, fill = Gender)) +
  geom_bar(color = "black", show.legend = FALSE) +
  scale_fill_manual(values = c("orange", "thistle")) +
  ggtitle("Visualising Distribution of Gender on Website") +
  xlab("Gender") +
  theme_bw()

# almost equal separation of doctors based on gender

Cleaning Text in R

library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

# Collapse all reviews into one single, space-separated piece of text.
review_text <- paste(data$Reviews, collapse = " ")

# Build a corpus with one document per review.
# The reviews are converted to plain "UTF-8": the previous target,
# "utf-8-mac", is a macOS-specific encoding name that iconv on other
# platforms may not recognise, which made the script non-portable.
corpus <- iconv(data$Reviews, to = "UTF-8")
corpus <- Corpus(VectorSource(corpus))

# Cleaning the data.
# Each tm_map() call below applies one transformation to every document and
# overwrites `corpus` with the result, so the order of the steps matters.

# 1. Lower-case all text.
corpus <- tm_map(corpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents

# 2. Strip punctuation characters.
corpus <- tm_map(corpus, removePunctuation)

## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents

# 3. Collapse runs of whitespace.
corpus <- tm_map(corpus, stripWhitespace)

## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents

# 4. Remove English stop words.
corpus <- tm_map(corpus, removeWords,stopwords("english"))

## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents

# 5. Remove URL remnants: delete "http" together with any run of alphanumeric
#    characters that directly follows it. Punctuation was already stripped in
#    step 2, so a URL has presumably been squashed into a single alphanumeric
#    token by now -- verify if the step order is ever changed.
#    The result is stored in `cleanset`; `corpus` itself is left untouched.
removeURL <- function(x) gsub('http[[:alnum:]]*', '', x)
cleanset <- tm_map(corpus, content_transformer(removeURL))

## Warning in tm_map.SimpleCorpus(corpus, content_transformer(removeURL)):
## transformation drops documents

# Stemming the words for further evaluation.
# NOTE(review): the stemmed text is written back to `corpus`, but the
# document-term matrix below is built from `cleanset`, which was created
# before this step. Within this section the stemming therefore never reaches
# `dtm`; the unstemmed term names printed further down ("absolutely",
# "accommodating", ...) are consistent with that. Confirm whether the matrix
# was meant to use the stemmed text before changing either line.
corpus <- tm_map(corpus, stemDocument)

## Warning in tm_map.SimpleCorpus(corpus, stemDocument): transformation drops
## documents

# Creating a document-term matrix from the cleaned (unstemmed) documents,
# then converting it to an ordinary matrix so that base functions such as
# colSums() can be applied to it.
dtm <- DocumentTermMatrix(cleanset)
dtm2 <- as.matrix(dtm)

Analyzing most used words on the website

# Total number of occurrences of each term across all reviews: the column
# sums of the document-term matrix, named by term.
frequency <- colSums(dtm2)
str(frequency)

##  Named num [1:3514] 4 48 3 18 115 357 128 1 29 60 ...
##  - attr(*, "names")= chr [1:3514] "absolute" "absolutely" "accommodating" "advice" ...

# Order the terms from most to least frequent and show the ten most used.
frequency <- sort(frequency, decreasing = TRUE)
head(frequency, n = 10)

##       doctor         time       always        great         kids         care 
##          463          407          357          266          259          257 
##     children pediatrician        years         love 
##          252          239          231          215

# Creating a word cloud of the top 100 words used.
library(wordcloud)

## Loading required package: RColorBrewer

# `frequency` was sorted in decreasing order above, so its first entries are
# the most frequent terms. head() is used instead of `[1:100]` so that a
# vocabulary with fewer than 100 terms does not produce NA words/counts.
words1 <- names(frequency)
wordcloud(
  head(words1, n = 100),
  head(frequency, n = 100),
  max.words = 150,
  # plot words in decreasing order of frequency (TRUE/FALSE spelled out, since
  # the shorthand `F` is an ordinary variable that can be reassigned)
  random.order = FALSE,
  min.freq = 5,
  colors = brewer.pal(8, "Dark2"),
  scale = c(4, 0.3),
  rot.per = 0.3
)

Sentiment Analysis

# Sentiment analysis
library(syuzhet)
library(tm)
library(SnowballC)

# Convert the reviews to plain "UTF-8" before scoring. The previous target,
# "utf-8-mac", is a macOS-specific encoding name that iconv on other
# platforms may not recognise, which made the script non-portable.
review <- iconv(data$Reviews, to = "UTF-8")

# Obtain NRC sentiment scores for the reviews.
s <- get_nrc_sentiment(review)

# Inspect the first few rows of the score table.
head(s)

# Spot-check how the lexicon scores a single word.
get_nrc_sentiment("delay")

# Bar plot of the total score of each sentiment column across all reviews.
sentiment_totals <- colSums(s)
barplot(sentiment_totals,
        las = 2,
        # one colour per bar; a fixed palette of six colours would be
        # recycled whenever `s` has more than six columns
        col = terrain.colors(length(sentiment_totals)),
        ylab = "Count",
        # keep the original 0-7000 axis, but extend it if any total is
        # larger so that no bar is cut off
        ylim = c(0, max(7000, sentiment_totals)),
        main = "Sentiment Scores for Reviews")

# We can thus see that, overall, there is a positive sentiment towards the
# doctors on the website, with trust being the factor that led to this
# sentiment.