**Introduction to Web Scraping**
Web scraping is a technique for converting data presented in an unstructured format (HTML tags) on the web into a structured format that can easily be accessed and used. Much of the data available on the web is not readily usable: it sits inside HTML and is not directly downloadable, so some knowledge and expertise is required to extract and work with it.
We can locate useful data based on CSS selectors, especially when the webpage uses semantic tag attributes. We can use [SelectorGadget](http://selectorgadget.com/) to find out which CSS selector matches the reviews. SelectorGadget can be added as an extension in Google Chrome, where it appears as a magnifying glass icon.
# Load (and install if missing) the required packages
pacman::p_load(tidyverse, tidytext, viridis, rvest, tm, wordcloud, SnowballC, tidyquant, ggridges, scales, highcharter, topicmodels)
We can specify the CSS selector in html_nodes() and extract the text with html_text(). We scrape over 1,800 reviews of Fiat Chrysler Automobiles from Glassdoor; the reviews are spread over about 155 web pages.
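Before scraping every page, we can test the selector on a single page (a minimal sketch using the first review page; the ".hreview" selector is the one used throughout this post):
# Minimal sketch: confirm that the ".hreview" selector returns text for one page
first_page <- read_html("https://www.glassdoor.com/Reviews/FCA-Fiat-Chrysler-Automobiles-Reviews-E149.htm")
first_page %>% html_node(".hreview") %>% html_text() %>% substr(1, 200)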
# The reviews span 155 pages, hence n = 155
n <- 155
FCA_urls <- paste0("https://www.glassdoor.com/Reviews/FCA-Fiat-Chrysler-Automobiles-Reviews-E149_P",seq(2, n), ".htm")
FCA_urls<-c("https://www.glassdoor.com/Reviews/FCA-Fiat-Chrysler-Automobiles-Reviews-E149.htm",FCA_urls)
# Read each page and pull out the text of the ".hreview" block
FCA_html <- FCA_urls %>%
  map_chr(~ read_html(.) %>% html_node(".hreview") %>% html_text())
FCA_html[[1]]
[1] "Featured Review Helpful (1)\"love the company has taught me a lot of new information in my contained growth as a mechanic\"StarStarStarStarStarCurrent Employee - Service Technician in Dayton, OHCurrent Employee - Service Technician in Dayton, OHI have been working at FCA Fiat Chrysler Automobiles full-time (More than 3 years)Prospros are that the company is great, pays for your training that will help in advancement. I get to work on the new cars and technology in the automotive fieldConsno cons as of yet, I love the work I do.and the folk I work with. plan on staying with the company through retirementAdvice to Managementkeep up the good workShare on FacebookShare on TwitterShare on WhatsAppShare via EmailCopy LinkLink Copied!Flag as InappropriateFlag as InappropriateHelpful (1) FCA Fiat Chrysler Automobiles Response seconds ago Edit • Delete FCA Fiat Chrysler Automobiles 2017-09-30 21:14 PDT"
We can remove all unwanted characters at this stage.
#Data-Preprocessing: removing '\n'
FCA_html<-gsub("\n","",FCA_html)
#remove all round brackets
FCA_html<-FCA_html%>%str_replace_all("\\(|\\)", "")
#remove all \\
FCA_html<-FCA_html%>%str_replace_all("\\\\", "")
#remove all non words and non numbers
#FCA_html<-FCA_html%>%str_replace_all("[^A-Za-z0-9]", "")
#remove all •
FCA_html<-FCA_html%>%str_replace_all("\\• ", "")
#remove all &
FCA_html<-FCA_html%>%str_replace_all("\\ & ", "")
#remove all non printable words
FCA_html<-FCA_html%>%str_replace_all("[^[:print:]]", "")
#remove all double quotes
FCA_html<-FCA_html%>%str_replace_all(pattern = "\"", replacement = "")
#FCAindeed2<-FCAindeed2%>%stringi::stri_unescape_unicode()
# remove digits
#FCA_html%>%str_replace_all(pattern = "[[:digit:]]+", replacement = "")
#tm::removeNumbers(FCA_html)
#### pattern for dates
pattern ="\\(?\\d{4}\\)?[.-]? *\\d{2}[.-]? *[.-]?\\d{2}"
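To see what this pattern captures, here is an illustrative check on a sample string similar to the review text:
# Illustrative only: the pattern picks out dates of the form YYYY-MM-DD
str_extract("FCA Fiat Chrysler Automobiles Response 2017-09-30 21:14 PDT", pattern)
# [1] "2017-09-30"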
date=FCA_html%>%str_extract_all(pattern)
#FCA_html[[1]]%>%str_subset(pattern = "([0-9]{1,2})[- .]([a-zA-Z]+)[- .]([0-9]{4})")
#FCA_html[[1]]
#unlist(Date)
Date=as.Date(unlist(date))
#FCA_html_2=data_frame(Date=as.Date(unlist(date)),FCA_html)
A quick look at the NRC lexicon shows how many words fall under each sentiment category.
get_sentiments(lexicon = "nrc")%>%
count(sentiment, sort = TRUE)
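We can also check which NRC sentiments an individual word maps to, for example:
# Example: sentiments associated with the word "love" in the NRC lexicon
get_sentiments("nrc") %>% filter(word == "love")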
Convert the text data to a data frame.
GlassdoorPages <- data_frame(date=as.Date(unlist(date)),page = seq(1, n),
text = c(FCA_html))%>%arrange(desc(date))
GlassdoorPages%>%head(5)
GlassdoorPages%>%tail(5)
Now we have the reviews and can convert them to a tidy text format.
tidy_FCA <- GlassdoorPages %>%
unnest_tokens(word, text) %>%
add_count(date) %>%
dplyr::rename(date_total = n)
#remove stop words
data("stop_words")
tidy_FCA <- tidy_FCA %>%
anti_join(stop_words)
Remove some additional user-defined stop words.
stop_user=c("linklink","whatsappshar","auburn","twittershar","edit","delet","via","starstarstarstarstarwork","pdt","hill","ago",
"facebookshar")
stop_user2=data_frame(word=stop_user)
tidy_FCA <- tidy_FCA %>%
anti_join(stop_user2)
tidy_FCA%>%head()
Next, let’s implement the sentiment analysis.
FCA_sentiment <- tidy_FCA %>%
inner_join(get_sentiments("nrc"))
FCA_sentiment%>%head()
Now we have all we need to see the relative changes in these sentiments over the years.
theme_set(theme_bw())
#Alternatively
#FCA_sentiment%>%group_by(page, page_total, sentiment)%>%count()
FCA_sentiment %>%
count(date, date_total, sentiment) %>%
filter(sentiment %in% c("positive", "negative",
"joy", "trust","fear","sadness"))%>%
mutate(sentiment = as.factor(sentiment)) %>%
#ggplot(aes(page, n / page_total, fill = sentiment)) +
ggplot(aes(date, n / sum(n), fill = sentiment)) +
geom_area(position = "identity", alpha = 0.5) +
labs(y = "Relative frequency", x = "Year",
title = "Sentiment analysis of FCA Glassdoor Reviews",
subtitle = "Using the nrc lexicon")+theme_bw()+
scale_fill_manual(values=viridis_pal(option = "D")(6))+
scale_y_continuous(labels = scales::percent)
FCA_sentiment %>%
count(date, date_total, sentiment) %>%
filter(sentiment %in% c("positive", "negative",
"joy", "trust","fear","sadness"))%>%
mutate(sentiment = as.factor(sentiment)) %>%
#ggplot(aes(page, n / page_total, fill = sentiment)) +
ggplot(aes(x=date,y= n / sum(n), fill = sentiment,height=n / sum(n),group=sentiment)) +
geom_ridgeline_gradient() +
labs(y = "Relative frequency", x = "Year",
title = "Sentiment analysis of FCA Glassdoor Reviews",
subtitle = "Using the nrc lexicon")+theme_bw()+
scale_fill_viridis(discrete = TRUE, direction = -1) +
scale_y_continuous(labels = scales::percent)
#expand x and y limits
#expand_limits(x=c(0,160), y=c(0, 0.0005))
From the beginning of 2008 to the end of 2013, the positive sentiments outweigh the negative sentiments. The level of trust in FCA was also higher than the sadness expressed by the reviewers. From 2014 onwards, the positive/trust sentiments no longer overwhelmingly dominate the negative/sadness sentiments; in general, the positive sentiments show a decline from 2014 onwards.
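This can be checked numerically with a quick sketch of the yearly share of positive versus negative words (assuming the lubridate package is installed):
# Sketch: yearly share of positive vs negative NRC words
FCA_sentiment %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(year = lubridate::year(date), sentiment) %>%
  group_by(year) %>%
  mutate(share = n / sum(n)) %>%
  ungroup()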
FCA_sentiment %>%
count(date, date_total, sentiment) %>%
# filter(sentiment %in% c("positive", "negative", "joy", "trust","fear","sadness"))%>%
mutate(sentiment = forcats::fct_lump(sentiment, 6))%>%
#mutate(sentiment = as.factor(sentiment)) %>%
ggplot(aes(date, n / date_total, fill = sentiment)) +
#ggplot(aes(page, n / sum(n), fill = sentiment)) +
geom_area(position = "identity", alpha = 0.5) +
labs(y = "Relative frequency", x = "Year",
title = "Sentiment analysis of FCA Glassdoor Reviews",
subtitle = "Using the nrc lexicon")+theme_bw()+
scale_fill_manual(values=viridis_pal(option = "A")(7))+
scale_y_continuous(labels = scales::percent)
tidy_FCA %>%
inner_join(get_sentiments("afinn")) %>%
group_by(date) %>%
summarize(average_sentiment = mean(score), words = n()) %>%
#filter(words >= 10) %>%
ggplot(aes(date, average_sentiment)) +
geom_line() +
geom_hline(color = "red", lty = 2, yintercept = 0) +
labs(y = "Average AFINN sentiment score", x = "Year",
title = "Sentiment analysis of FCA Glassdoor Reviews",
subtitle = "Using the AFINN lexicon")+
geom_smooth(method = "loess")
The average sentiment was positive in the first half of 2008. From the middle of 2008 to the last quarter of 2009 there was a downward trend in the average sentiment expressed by reviewers, which can be attributed to the global financial crisis that started around that time. The most positive reviews occurred in October 2015 and April 2016, whereas the most negative sentiments were expressed in August 2016.
ldat=tidy_FCA %>%
inner_join(get_sentiments("afinn")) %>%
group_by(date) %>%
summarize(average_sentiment = mean(score), words = n())
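Before plotting, the dates with the most extreme average scores can be pulled out directly as a check on the observations above:
# Dates with the highest and lowest average AFINN sentiment
ldat %>% arrange(desc(average_sentiment)) %>% head(3)
ldat %>% arrange(average_sentiment) %>% head(3)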
highchart() %>%
hc_title(text = "Sentiment analysis of FCA Glassdoor Reviews")%>%
hc_add_series_times_values(ldat$date, ldat$average_sentiment,
name = "Year",color="#440154FF")%>%
hc_yAxis(title = list(text = "Average AFINN sentiment score"),labels = list(format = "{value}"), max = 4,min=-4,plotLines = list(
list(label = list(text = ""),
color = "#35B779FF",
width = 2,
value = 0)))
highchart(type = "stock") %>%
hc_title(text = "Sentiment analysis of FCA Glassdoor Reviews") %>%
hc_subtitle(text = "") %>%
hc_tooltip(valueDecimals = 2) %>%
hc_add_series_times_values(ldat$date, ldat$average_sentiment,
name = "",color="#440154FF")%>%
hc_add_theme(hc_theme_gridlight())%>%
hc_yAxis(title = list(text = "Average AFINN sentiment score"),labels = list(format = "{value}"), max = 4,min=-4,plotLines = list(
list(label = list(text = ""),
color = "red",
width = 2,
value = 0)))
tidy_FCA %>%
inner_join(get_sentiments("afinn")) %>%
group_by(date) %>%
summarize(average_sentiment = mean(score), words = n()) %>%
# filter(words >= 5) %>%
ggplot(aes(date, average_sentiment)) +
geom_line( )+
theme_minimal()+
geom_ridgeline_gradient(aes(y=0,height=average_sentiment,fill=average_sentiment),min_height=-3.5)+
scale_fill_viridis(option="C",limit=c(-3.5,4))+
#geom_line() +
geom_hline(color = "red", lty = 2, yintercept = 0) +
labs(y = "Average AFINN sentiment score", x = "Year",
title = "Sentiment analysis of FCA Glassdoor Reviews",
subtitle = "Using the AFINN lexicon")
FCA_sentiment %>%
count(sentiment, word) %>%
filter(sentiment %in% c("positive", "negative",
"joy", "trust","fear","sadness")) %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup %>%
mutate(word = reorder(word, n)) %>%
mutate(sentiment = as.factor(sentiment)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_bar(alpha = 0.8, show.legend = FALSE,stat = "identity") +
coord_flip() +
scale_y_continuous(expand = c(0,0)) +
facet_wrap(~sentiment, scales = "free") +
labs(y = "Total number of occurrences", x = "",
title = "Sentiment analysis of FCA Glassdoor Reviews",
subtitle = "Using the nrc lexicon")+theme_bw()+
scale_fill_manual(values=viridis_pal(option = "D")(6))
# # change text into italics
# theme(strip.text = element_text(face = "italic")) +
# # strip horizontal axis labels
# theme(axis.title.x=element_blank()) +
# theme(axis.ticks.x=element_blank()) +
# theme(axis.text.x=element_blank())
FCA_sentiment %>%
count(date, date_total, sentiment) %>%
filter(sentiment %in% c("positive", "negative",
"joy", "trust","fear","sadness"))%>%
mutate(sentiment = factor(sentiment, levels = c("negative",
"positive",
"joy", "trust","fear","sadness"))) %>%
ggplot(aes(date, n / date_total, fill = sentiment)) +
geom_area(position = "identity", alpha = 0.5) +
labs(y = "Relative frequency", x = NULL,
title = "Sentiment analysis of FCA Glassdoor Reviews",
subtitle = "Using the nrc lexicon")+theme_bw()
FCA_sentiment <- tidy_FCA %>%
inner_join(get_sentiments("bing"))
FCA_sentiment %>%
count(date, date_total, sentiment)%>%
mutate(sentiment = as.factor(sentiment))%>%
ggplot(aes(date, n / date_total, fill = sentiment)) +
geom_area(position = "identity", alpha = 0.5) +
labs(y = "Relative frequency", x = "Year",
title = "Sentiment analysis of FCA Glassdoor Reviews",
subtitle = "Using the bing lexicon")+theme_bw()+
# scale_fill_manual(values=viridis_pal(option = "plasma")(2))+
scale_y_continuous(labels = scales::percent)
The distributions of the negative and positive sentiments are similar, with the negative sentiments having a slightly higher peak. The negative reviews are spread about as evenly as the positive reviews; neither clearly dominates the other.
GlassdoorPages %>%
unnest_tokens(word, text)%>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE)%>%
spread(sentiment,n,fill=0)%>%
mutate(sentiment = positive -negative)%>%
ggplot(aes(x = sentiment)) +
geom_density(color = palette_light()[1], fill = palette_light()[1], alpha = 0.8) +
theme_tq()+xlim(c(-5,5))
den=GlassdoorPages %>%
unnest_tokens(word, text)%>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE)%>%
spread(sentiment,n,fill=0)%>%
mutate(sentiment = positive -negative)
#hchart(den,density(den$sentiment), type = "area", color = "#B71C1C", name = "Density")
hchart(density(den$sentiment), type = "area", color =viridis_pal()(1), name = "Sentiment")%>%
hc_xAxis(min = -5, max =5)%>%
hc_yAxis(title = list(text = "density"),labels = list(format = "{value}"))
The most common positive and negative words are visualized below.
FCA_sentiment %>%
count(sentiment, word) %>%
group_by(sentiment) %>%
top_n(15) %>%
ungroup %>%
mutate(word = reorder(word, n)) %>%
mutate(sentiment = as.factor(sentiment)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(alpha = 0.8, show.legend = FALSE) +
coord_flip() +
scale_y_continuous(expand = c(0,0)) +
facet_wrap(~sentiment,scales="free") +
labs(y = "Total number of occurrences", x = "",
title = "Sentiment analysis of FCA Glassdoor Reviews",
subtitle = "Using the bing lexicon")+
#scale_fill_manual(values=viridis_pal(option = "D")(8))+
scale_fill_viridis(end = 0.75, discrete=TRUE, direction = -1) +
scale_x_discrete(expand=c(0.02,0)) +
theme(strip.text=element_text(hjust=0)) +
# change text into italics
theme(strip.text = element_text(face = "italic")) +
# strip horizontal axis labels
theme(axis.title.x=element_blank()) +
theme(axis.ticks.x=element_blank()) +
theme(axis.text.x=element_blank())+
theme_minimal(base_size = 13)
The counts of the most common positive and negative words are displayed graphically below.
bing_word_counts <-tidy_FCA %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
bing_word_counts%>%spread(sentiment,n,fill = 0)%>%top_n(10)
bing_word_counts%>%spread(sentiment,n,fill = 0)%>%top_n(-10)%>%head(10)
bing_word_counts %>%
filter(n > 3) %>%
mutate(n = if_else(sentiment == "negative", -n, n)) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(y="Contribution to sentiment",title="bing sentiments")+
#scale_fill_manual(values=viridis_pal(option = "D")(2))
scale_fill_viridis(end = 0.85, discrete=TRUE, direction = 1)
# n=4
#
# FCA=list()
#
# for (i in 2:n){
#
# FCA[[1]]=read_html("https://www.glassdoor.com/Reviews/FCA-Fiat-Chrysler-Automobiles-Reviews-E149.htm")%>% html_nodes(".hreview")%>%html_text(trim = TRUE)
#
# FCA[[i]]=read_html(paste("https://www.glassdoor.com/Reviews/FCA-Fiat-Chrysler-Automobiles-Reviews-E149_P",i,".htm",sep = ""))%>% html_nodes(".hreview")%>%html_text(trim = TRUE)
#
# }
#FCA_html2<-FCA_html%>%str_replace_all("[[:xdigit:]]", "")
corpus = Corpus(VectorSource(FCA_html))
# wrap base functions in content_transformer() so the corpus structure is preserved
corpus = tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, stripWhitespace)
corpus = tm_map(corpus, removeNumbers)
corpus = tm_map(corpus, removeWords, stopwords("english"))
corpus = tm_map(corpus, removeWords, stop_user)
tdm <- TermDocumentMatrix(corpus,
control = list(removePunctuation = TRUE,
stopwords = TRUE,
removeNumbers = TRUE, tolower = TRUE,
PlainTextDocument=TRUE,
stripWhitespace=TRUE, stemming = TRUE))
inspect(tdm)
<<TermDocumentMatrix (terms: 2165, documents: 155)>>
Non-/sparse entries: 9569/326006
Sparsity : 97%
Maximal term length: 38
Weighting : term frequency (tf)
Sample :
Docs
Terms 113 115 116 118 145 27 28 33 43 84
automobil 2 2 2 2 3 3 3 2 2 3
chrysler 5 7 3 2 3 3 3 2 2 3
copi 1 1 1 1 1 1 1 1 1 1
employe 3 3 2 2 5 4 9 4 6 2
fca 2 2 2 2 3 3 6 4 3 3
fiat 3 2 2 2 5 3 3 2 2 3
life 1 1 3 1 1 2 3 1 1 1
respons 1 1 1 1 2 1 1 1 1 1
second 1 1 1 1 1 2 1 1 1 1
work 4 1 2 1 8 8 7 0 3 3
tidy(tdm)
tdm = as.matrix(tdm)
frequencies = DocumentTermMatrix(corpus)
frequencies
<<DocumentTermMatrix (documents: 155, terms: 2653)>>
Non-/sparse entries: 9847/401368
Sparsity : 98%
Maximal term length: 38
Weighting : term frequency (tf)
findFreqTerms(frequencies, lowfreq=100)
[1] "automobiles" "chrysler"
[3] "copied" "delete"
[5] "emailcopy" "employee"
[7] "facebookshare" "fca"
[9] "fiat" "flag"
[11] "inappropriateflag" "inappropriatehelpful"
[13] "response" "seconds"
[15] "time" "twittershare"
[17] "whatsappshare" "work"
[19] "anonymous" "balanceculturevaluescareer"
[21] "life" "opportunitiescompbenefitssenior"
Remove sparse terms: removeSparseTerms() with a threshold of 0.995 drops any term that is absent from more than 99.5% of the documents.
sparse = removeSparseTerms(frequencies, 0.995)
sparse
<<DocumentTermMatrix (documents: 155, terms: 2653)>>
Non-/sparse entries: 9847/401368
Sparsity : 98%
Maximal term length: 38
Weighting : term frequency (tf)
What about associations between words? Let's have a look at which other words had a high association with "love", "poor", "flexible", and "horrible".
findAssocs(frequencies, c("love","poor","flexible","horrible"), c(0.6,0.6,0.6,0.6))
$love
dayton featured
0.82 0.82
fieldconsno mechanicstarstarstarstarstarcurrent
0.82 0.82
ohi retirementadvice
0.82 0.82
staying taught
0.82 0.82
yearsprospros
0.82
$poor
ctc continue advances
0.75 0.75 0.71
allowed cheapest choice
0.71 0.71 0.71
clue creation currect
0.71 0.71 0.71
decides developed drained
0.71 0.71 0.71
energy entire ethics
0.71 0.71 0.71
executed exited fantastic
0.71 0.71 0.71
feature final flounder
0.71 0.71 0.71
focused goals gouge
0.71 0.71 0.71
hair hammers highlighted
0.71 0.71 0.71
hurt incorporate interest
0.71 0.71 0.71
items location merger
0.71 0.71 0.71
minimal mirrors outstanding
0.71 0.71 0.71
participate partners price
0.71 0.71 0.71
questionable reduce requested
0.71 0.71 0.71
roof shown sighted
0.71 0.71 0.71
sink site slogans
0.71 0.71 0.71
smoke stimulate swimstarstarstarstarstarwork
0.71 0.71 0.71
talked tco viability
0.71 0.71 0.71
view warranty washington
0.71 0.71 0.71
words recent since
0.71 0.63 0.63
seen
0.60
$flexible
numeric(0)
$horrible
bump consvacation
0.70 0.70
designers designerstarstarstarstarstarwork
0.70 0.70
earn impossible
0.70 0.70
members possibility
0.70 0.70
yearsproshealth affect
0.70 0.70
affects anecdotal
0.70 0.70
assertions astounding
0.70 0.70
badge blow
0.70 0.70
brought century
0.70 0.70
ceoprosalthough channel
0.70 0.70
chasm circumstances
0.70 0.70
coduit colonels
0.70 0.70
comradery concept
0.70 0.70
countless disaster
0.70 0.70
eliminate evidence
0.70 0.70
exceptional exhaustive
0.70 0.70
exist fatal
0.70 0.70
fine giant
0.70 0.70
grant granting
0.70 0.70
guns habits
0.70 0.70
intellect majors
0.70 0.70
managementwhen maybe
0.70 0.70
mechanism minded
0.70 0.70
night nonetheless
0.70 0.70
nowhere observation
0.70 0.70
physics preventing
0.70 0.70
procedures proof
0.70 0.70
recipe regard
0.70 0.70
roots separating
0.70 0.70
sharp shooting
0.70 0.70
soley special
0.70 0.70
statements telling
0.70 0.70
testing tops
0.70 0.70
trenches trumps
0.70 0.70
ugly unfortunately
0.70 0.70
war weeds
0.70 0.70
whole winning
0.70 0.70
wiped wonderful
0.70 0.70
however now
0.63 0.61
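findAssocs() reports the correlation between term-frequency vectors across documents, so an individual association can be reproduced directly (a sketch using two terms from the output above):
# Sketch: reproduce one association as a plain correlation of term counts
m <- as.matrix(frequencies)
cor(m[, "love"], m[, "dayton"])  # close to the 0.82 reported above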
The most commonly used words in the reviews are plotted further below; first, the most common positive and negative words are depicted in a comparison word cloud.
GlassdoorPages %>%
unnest_tokens(word, text)%>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
reshape2::acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = viridis_pal(option = "D")(2),
max.words = 100)
GlassdoorPages %>%
unnest_tokens(word, text)%>%
inner_join(get_sentiments("nrc")) %>%
count(word, sentiment, sort = TRUE)%>%
spread(sentiment, n, fill = 0)%>%head(5)
Some of the words commonly used by reviewers to express positive, negative, joy, or sadness sentiments are displayed in the word cloud below.
GlassdoorPages %>%
unnest_tokens(word, text)%>%
inner_join(get_sentiments("nrc")) %>%
count(word, sentiment, sort = TRUE)%>%
filter(sentiment %in% c("negative","positive","joy","sadness"))%>%
reshape2::acast(word ~ sentiment, value.var = "n", fill = 0)%>%
comparison.cloud(colors = viridis_pal(option = "D")(4),
max.words = 200)
library(igraph)
library(ggraph)
tidy_descr_ngrams=GlassdoorPages %>%
unnest_tokens(word, text, token = "ngrams", n = 2) %>%
separate(word, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)%>%
filter(!word1 %in% stop_user) %>%
filter(!word2 %in% stop_user)%>%
mutate(word1 = removeNumbers(word1))%>%
mutate(word2 = removeNumbers(word2))
bigram_counts=tidy_descr_ngrams %>%
count(word1, word2, sort = TRUE)
bigram_graph =bigram_counts %>%
filter(n > 10) %>%
graph_from_data_frame()
set.seed(1)
a=grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = palette_light()[1], size = 5, alpha = 0.8) +
geom_node_text(aes(label = name), vjust = 1, hjust = 0.5) +
theme_void()
The most common word is "employee", which suggests that the majority of the reviewers were either current or former employees. The rest of the frequent terms relate to management and the work environment.
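A quick word count can be used to check this (a sketch):
# Sketch: most frequent words after stop-word removal
tidy_FCA %>% count(word, sort = TRUE) %>% head(10)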
data(stop_words)
tidy_descr<-GlassdoorPages %>%
unnest_tokens(word, text) %>%
mutate(word=removeNumbers(word))%>%
mutate(word_stem = wordStem(word)) %>%
anti_join(stop_words, by = "word") %>%
filter(!word_stem %in% stop_words$word) %>%
filter(!word_stem %in% stop_user)
tidy_descr %>%
count(word_stem, sort = TRUE) %>%
filter(n > 30) %>%
ggplot(aes(x = reorder(word_stem, n), y = n)) +
geom_col(color = palette_light()[1], fill = palette_light()[1], alpha = 0.8) +
coord_flip() +
theme_tq() +
labs(x = "",
y = "count of most common words", title = "Count of most common words")
tidy_descr %>%
count(word_stem) %>%
mutate(word_stem = removeNumbers(word_stem)) %>%
with(wordcloud(word_stem, n, max.words = 100, colors = palette_light()))
bigram_counts %>%
mutate(word1 = removeNumbers(word1))%>%
mutate(word2 = removeNumbers(word2))%>%
filter(n > 20) %>%
ggplot(aes(x = reorder(word1,-n), y = reorder(word2,-n), fill = n)) +
geom_tile(alpha = 0.8, color = "white") +
scale_fill_gradientn(colours = c(palette_light()[[1]], palette_light()[[2]])) +
coord_flip() +
theme_tq() +
theme(legend.position = "right") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
labs(x = "first word in pair",
y = "second word in pair")
tidy_descr_ngrams=GlassdoorPages %>%
unnest_tokens(word, text, token = "ngrams", n = 2)%>%
separate(word, c("word1", "word2"), sep = " ")%>%
mutate(word1=removeNumbers(word1))%>%
mutate(word1 = wordStem(word1))%>%
mutate(word2=removeNumbers(word2))%>%
mutate(word2 = wordStem(word2))
tidy_descr_ngrams
tidy_FCA %>%
inner_join(get_sentiments("bing"))%>%
group_by(sentiment)%>%count()%>%
ggplot(aes(x = reorder(sentiment, n), y = n,fill=palette_light()[1])) +
geom_col( alpha = 0.8,width = 0.5) +
coord_flip() +
theme_tq()+
labs(y = "frequency", title = "bing lexicon sentiment count", x = "sentiment")+
theme(legend.position="none")+
#scale_fill_viridis(end = 0.85, discrete=TRUE, direction = 1,option = "D")
#scale_fill_manual(values=viridis_pal(option = "A")(2))
scale_fill_tq()
tidy_FCA %>%
inner_join(get_sentiments("nrc"))%>%
group_by(sentiment)%>%count()%>%
ggplot(aes(x = reorder(sentiment, n), y = n,fill=palette_light()[1])) +
geom_col( alpha = 0.8) +
coord_flip() +
theme_tq()+
labs(y = "frequency", title = "nrc lexicon sentiment count", x = "sentiment")+
theme(legend.position="none")+
#scale_fill_viridis(end = 0.85, discrete=TRUE, direction = 1,option = "D")
#scale_fill_manual(values=viridis_pal(option = "A")(2))
scale_fill_tq()
Topic modeling is a method for the unsupervised classification of documents: each document is modeled as a mixture of topics, and each topic as a mixture of words. Latent Dirichlet allocation (LDA) is a particularly popular method for fitting a topic model.
We can investigate the top five topics in the reviews using topic modeling.
dtm_words_count<-tidy_descr %>%
mutate(word_stem = removeNumbers(word_stem)) %>%
count(date, word_stem, sort = TRUE) %>%
ungroup() %>%
filter(word_stem != "") %>%
# Casting a data frame to a DocumentTermMatrix, TermDocumentMatrix, or dfm
cast_dtm(date, word_stem, n)
# set a seed so that the output of the model is predictable
dtm_lda<-LDA(dtm_words_count, k = 5, control = list(seed = 1234))
topics_beta<-tidy(dtm_lda, matrix = "beta")
p1<-topics_beta %>%
filter(grepl("[a-z]+", term)) %>% # keep terms containing letters a-z
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta) %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, color = factor(topic), fill = factor(topic))) +
geom_col(show.legend = FALSE, alpha = 0.8) +
scale_color_manual(values = palette_light()) +
scale_fill_manual(values = palette_light()) +
facet_wrap(~ topic, ncol = 5) +
coord_flip() +
theme_tq() +
labs(x = "",
y = "beta (~ occurrence in topics 1-5)",
title = "The top 10 most characteristic words describe topic categories.")
user_topic<-tidy(dtm_lda, matrix = "gamma") %>%
arrange(desc(gamma)) %>%
group_by(document) %>%
top_n(1, gamma)
p2<-user_topic %>%
group_by(topic) %>%
top_n(10, gamma) %>%
ggplot(aes(x = reorder(document, -gamma), y = gamma, color = factor(topic))) +
facet_wrap(~ topic, scales = "free", ncol = 5) +
geom_point(show.legend = FALSE, size = 4, alpha = 0.8) +
scale_color_manual(values = palette_light()) +
scale_fill_manual(values = palette_light()) +
theme_tq() +
coord_flip() +
labs(x = "",
y = "gamma\n(~ affiliation with topics 1-5)")
library(grid)
library(gridExtra)
#grid.arrange(p1, p2, ncol = 1, heights = c(0.7, 0.3))
p1
p2
The tidy data frame (one term per document per row) can be cast into a document-term matrix with the cast_dtm() function in tidytext. The LDA() function accepts input in this format.
date_dtm<-GlassdoorPages %>%
mutate(year=year(date))%>%
unnest_tokens(word, text) %>%
anti_join(stop_words)%>%
anti_join(stop_user2) %>%
filter(!word %in% stop_user) %>%
mutate(word=removeNumbers(word))%>%
mutate(word = wordStem(word))%>%
filter(word != "") %>%
count(year, word, sort = TRUE) %>%
ungroup()%>%
cast_dtm(year, word, n)
We can now use the topicmodels package to create a five-topic LDA model.
date_lda <- LDA(date_dtm, k = 5, control = list(seed = 1234))
date_lda_td <- tidy(date_lda)
date_lda_td%>%head()
The β value in each row is the per-topic word probability, i.e., the probability that the term in that row is generated by the topic in that row.
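As a quick sanity check, the β values within each topic sum to one:
# Check: per-topic word probabilities sum to 1
date_lda_td %>%
  group_by(topic) %>%
  summarize(total_beta = sum(beta))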
top_terms <- date_lda_td %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
filter(grepl("[a-z]+", term)) %>% # keep terms containing letters a-z
arrange(topic, -beta)
top_terms%>%head()
top_terms %>%mutate(topic=as.factor(topic))%>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = topic)) +
geom_bar(stat = "identity",show.legend = FALSE, alpha = 0.8) +
facet_wrap(~ topic, scales = "free") +
theme(axis.text.x = element_text(size = 15, angle = 90, hjust = 1))+
coord_flip()+
scale_fill_viridis(end = 0.75, discrete=TRUE, direction = -1,option = "D")+
labs(x = "",
y = "beta (occurrence in topics 1-5)",
title = "The top 10 most characteristic words describe topic categories.")
We treat each year from 2008 onwards as a separate document. Setting matrix = "gamma" returns a tidied version with one document-topic combination per row, where gamma is the estimated proportion of the document's words generated by that topic. With these per-document probabilities, we can see how well the unsupervised model distinguishes the five topics.
date_lda_gamma <- tidy(date_lda, matrix = "gamma")%>%mutate(topic=factor(topic))
date_lda_gamma
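Similarly, each document's γ values sum to one across the five topics:
# Check: per-document topic proportions sum to 1
date_lda_gamma %>%
  group_by(document) %>%
  summarize(total_gamma = sum(gamma))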
ggplot(date_lda_gamma, aes(gamma, fill = topic)) +
geom_histogram(binwidth = 0.25) +
facet_wrap(~ document, nrow = 2)+
scale_fill_viridis(end = 0.75, discrete=TRUE, direction = -1)+theme_tq()