# Load the data ----
# Read the review workbook into `data`. With .name_repair = "minimal" the
# column names are taken from the sheet as-is (no renaming or de-duplication).
library(readxl)
data <- read_excel(
  path = "websitedata.xlsx",
  .name_repair = "minimal"
)
# Univariate Analysis ----
# Separation based on gender of doctors
data$Gender <- factor(data$Gender)
library(ggplot2)

# One fill colour per factor level. The two-colour palette is recycled with
# rep_len() so the chart still renders when Gender does not have exactly two
# levels; a fixed length-2 `fill` passed straight to geom_bar() fails as soon
# as the number of bars differs from two (e.g. an extra level or NA values).
gender_palette <- rep_len(c("orange", "thistle"), nlevels(data$Gender))

# Bar chart of the number of rows per gender. The legend is hidden because the
# x axis already labels each bar.
ggplot(data = data, mapping = aes(x = Gender, fill = Gender)) +
  geom_bar(color = "black", show.legend = FALSE) +
  scale_fill_manual(values = gender_palette) +
  ggtitle("Visualising Distribution of Gender on Website") +
  xlab("Gender") +
  theme_bw()

# almost equal separation of doctors based on gender
# Cleaning Text in R ----
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate

# creating one large piece of text
review_text <- paste(data$Reviews, collapse = " ")

# Building a corpus with one document per review.
# The conversion target is plain "UTF-8": the previous target "utf-8-mac" is a
# macOS-specific encoding name, so the script could not run elsewhere.
corpus <- iconv(data$Reviews, to = "UTF-8")
corpus <- Corpus(VectorSource(corpus))

# Cleaning the data: lower-case, strip punctuation, collapse whitespace and
# drop English stop words.
# In the recorded run every tm_map() call below emitted
#   "Warning in tm_map.SimpleCorpus(...): transformation drops documents".
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))

# Remove URL remnants. Punctuation is already gone at this point, so a URL has
# collapsed into one alphanumeric token starting with "http", which this
# pattern deletes as a whole.
removeURL <- function(x) gsub('http[[:alnum:]]*', '', x)
cleanset <- tm_map(corpus, content_transformer(removeURL))

# Stemming the words for further evaluation. The stemmed corpus is derived
# from `cleanset` so that it also benefits from the URL removal; previously it
# was derived from `corpus` and silently skipped that step.
corpus <- tm_map(cleanset, stemDocument)

# Creating a document-term matrix.
# NOTE(review): the matrix is built from the unstemmed `cleanset`, which is
# what the recorded term list (e.g. "absolutely", "accommodating") reflects.
# Confirm that reporting whole words rather than stems is intended.
dtm <- DocumentTermMatrix(cleanset)
dtm2 <- as.matrix(dtm)
# Analyzing most used words on the website ----
# Total occurrences of each term across all reviews (column sums of the
# document-term matrix), inspected first in matrix column order.
frequency <- colSums(dtm2)
str(frequency)
## Named num [1:3514] 4 48 3 18 115 357 128 1 29 60 ...
## - attr(*, "names")= chr [1:3514] "absolute" "absolutely" "accommodating" "advice" ...

# Re-order from most to least frequent and show the ten most written words.
frequency <- frequency[order(frequency, decreasing = TRUE, na.last = NA)]
head(frequency, n = 10)
## doctor time always great kids care
## 463 407 357 266 259 257
## children pediatrician years love
## 252 239 231 215
# Creating a wordcloud of the top 100 words used
library(wordcloud)
## Loading required package: RColorBrewer
words1 <- names(frequency)

# head() returns at most 100 entries, so a vocabulary smaller than 100 terms
# no longer pads the input with NA words/counts the way `[1:100]` did.
# `frequency` is already sorted in decreasing order at this point.
top_terms <- head(frequency, n = 100)

wordcloud(
  names(top_terms),
  top_terms,
  max.words = 150,
  random.order = FALSE, # spelled out: `F` is an ordinary, reassignable variable
  min.freq = 5,
  colors = brewer.pal(8, "Dark2"),
  scale = c(4, 0.3),
  rot.per = 0.3
)

# Sentiment Analysis ----
# Sentiment analysis
library(syuzhet)
library(tm)
library(SnowballC)

# Convert the raw reviews to plain "UTF-8"; the previous target "utf-8-mac" is
# a macOS-specific encoding name and made this step non-portable.
review <- iconv(data$Reviews, to = "UTF-8")

# Obtain sentiment scores: one row per review, one column per NRC category.
s <- get_nrc_sentiment(review)
head(s)

# Example lookup for a single word.
get_nrc_sentiment("delay")

# Bar plot of the total score per category.
# The y axis keeps the original 0-7000 range but grows when a category total
# exceeds it, so tall bars are not clipped on larger data sets.
# NOTE(review): terrain.colors(6) supplies six colours, which barplot()
# recycles if there are more bars - confirm this is the intended colouring.
sentiment_totals <- colSums(s)
barplot(sentiment_totals,
  las = 2,
  col = terrain.colors(6),
  ylab = "Count",
  ylim = c(0, max(7000, sentiment_totals)),
  main = "Sentiment Scores for Reviews"
)

# We can thus see that, overall, there is a positive sentiment on the website
# for doctors, with trust being the factor which led to that sentiment.