library(scales)
library(corrplot)
library(plyr)
library(tidyr)
library(dplyr)
library(stringr)
library(tm)
library(tidytext)
library(stopwords)
library(textstem)
library(readxl)
library(textdata)
library(ggpubr)
library(wordcloud)
library(topicmodels)
library(ggplot2)
library(textrank)
library(udpipe)
library(lattice)
library(igraph)
library(ggraph)
library(gofastr)
library(quanteda)
library(readtext)
library(devtools)
library(spacyr)
# devtools::install_github("quanteda/quanteda.corpora")
library(quanteda.corpora)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(seededlda)
library(lubridate)
library(stm)
library(Rtsne)
library(rsvd)
library(geometry)
library(tidyverse)
library(stringi)
library(ldatuning)
library(knitr)
# ---- Load and clean the Glassdoor reviews ----
# Read the reviews and the job-title sheet, then join them on the job title.
glass <- read_excel("/Users/liza/Desktop/ДИПЛОМ/Glassdoor reviews 2.xlsx")
jobs <- read_excel("/Users/liza/Desktop/ДИПЛОМ/jobs.xlsx", sheet = 6)
glass <- full_join(x = glass, y = jobs, by = "JobTitle")
glass <- unique(glass)
# Drop the rows that have no company. The length check matters: with an empty
# index, `glass[-na, ]` would select zero rows and wipe the whole data set.
na <- which(is.na(glass$Company))
if (length(na) > 0) {
  glass <- glass[-na, ]
}
# Mail.Ru is now called VK, so the company name is updated accordingly.
glass$Company <- ifelse(glass$Company == "Mail.Ru", "VK", glass$Company)
glass$Company <- as.factor(glass$Company)
# Parse the date strings, which are written as day-month-year.
glass$Date <- dmy(glass$Date)
glass$EmployeeStatus <- as.factor(glass$EmployeeStatus)
glass$EmployeeType <- as.factor(glass$EmployeeType)
# In some rows the letter "l" is missing from "less than 1 year"; repair it.
glass$WorkExperience <- ifelse(glass$WorkExperience == "ess than 1 year",
                               "less than 1 year", glass$WorkExperience)
glass$WorkExperience <- as.factor(glass$WorkExperience)
# Some reviewers did not indicate their job title, so their category is
# "Anonymous". It is set to NA so that they do not form a separate group.
glass$Category <- ifelse(glass$Category == "Anonymous", NA, glass$Category)
glass$Category <- as.factor(glass$Category)
glass$ID <- as.factor(glass$ID)
# The same is done for the job titles: every "Anonymous ..." variant is
# collapsed into one level, which is then turned into NA.
glass$JobTitle <- as.factor(glass$JobTitle)
glass$JobTitle <- fct_collapse(
  glass$JobTitle,
  "Anonymous" = c("Anonymous Employee", "Anonymous", "Anonymous Contractor",
                  "Anonymous Freelancer", "Anonymous Intern")
)
levels(glass$JobTitle)[levels(glass$JobTitle) == "Anonymous"] <- NA
glass$Location <- as.factor(glass$Location)
glass$Rating <- as.factor(glass$Rating)
# Creating variable 'Text': missing parts become empty strings so that
# unite() does not paste the literal text "NA" into the review.
glass$Pros <- ifelse(is.na(glass$Pros), "", glass$Pros)
glass$Cons <- ifelse(is.na(glass$Cons), "", glass$Cons)
glass$AdviceToManagement <- ifelse(is.na(glass$AdviceToManagement), "",
                                   glass$AdviceToManagement)
glass <- unite(glass, Text, c(Cons, Pros, AdviceToManagement), sep = " ")
# Counting the number of space-separated pieces (words) in each review
glass$Count <- lengths(strsplit(glass$Text, " "))
summary(glass)
## ID Company Title Rating EmployeeStatus
## K1 : 1 Kaspersky: 521 Length:2022 1 : 58 Current:1328
## K10 : 1 VK : 354 Class :character 2 : 70 Former : 693
## K100 : 1 Yandex :1147 Mode :character 3 : 193 NA's : 1
## K101 : 1 4 : 649
## K102 : 1 5 :1051
## K103 : 1 NA's: 1
## (Other):2016
## EmployeeType WorkExperience
## Contractor : 27 less than 1 year :280
## Employee :1879 more than 1 year :454
## Freelancer : 11 more than 10 years: 23
## Intern : 99 more than 3 years :316
## Temporary Employee: 5 more than 5 years :191
## NA's : 1 more than 8 years : 60
## NA's :698
## JobTitle Location Date
## Software Engineer : 176 Moscow :1147 Min. :2009-02-23
## Software Developer : 92 Saint Petersburg: 112 1st Qu.:2016-12-09
## Project Manager : 68 Woburn : 48 Median :2019-12-22
## Senior Software Engineer: 63 London : 30 Mean :2018-12-29
## Frontend Developer : 48 Yekaterinburg : 24 3rd Qu.:2021-04-02
## (Other) :1185 (Other) : 194 Max. :2022-02-28
## NA's : 390 NA's : 467 NA's :1
## Text Category Count
## Length:2022 Entry :998 Min. : 6.00
## Class :character Mid :401 1st Qu.: 13.00
## Mode :character Senior:231 Median : 20.00
## NA's :392 Mean : 36.11
## 3rd Qu.: 40.75
## Max. :435.00
##
# Show the structure (column names and types) of the cleaned review data.
str(glass)
## tibble [2,022 × 13] (S3: tbl_df/tbl/data.frame)
## $ ID : Factor w/ 2022 levels "K1","K10","K100",..: 876 1135 1246 1357 1468 1579 1690 1801 1912 877 ...
## $ Company : Factor w/ 3 levels "Kaspersky","VK",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Title : chr [1:2022] "Great company" "Used to be nice" "good" "Nice for start" ...
## $ Rating : Factor w/ 5 levels "1","2","3","4",..: 5 3 5 3 5 5 5 4 5 3 ...
## $ EmployeeStatus: Factor w/ 2 levels "Current","Former": 2 1 2 2 2 1 2 1 2 2 ...
## $ EmployeeType : Factor w/ 5 levels "Contractor","Employee",..: 2 2 3 2 2 4 2 2 1 2 ...
## $ WorkExperience: Factor w/ 6 levels "less than 1 year",..: 4 2 NA NA 3 NA 2 NA NA 2 ...
## $ JobTitle : Factor w/ 549 levels ".NET Developer",..: 90 148 458 14 439 338 465 465 96 503 ...
## $ Location : Factor w/ 93 levels "Abingdon","Abuja",..: 52 NA 52 52 52 2 92 52 82 52 ...
## $ Date : Date[1:2022], format: "2020-12-08" "2020-12-08" ...
## $ Text : chr [1:2022] "Did not notice any Cons interesting challenging tasks \r\nhighly professional specialists \r\nideal working con"| __truncated__ "Outdated technical stack with lots of domestic solutions unknown outside Yandex, and with poor documentation. N"| __truncated__ "It's a big company. Salary is medium-high. It's a big company. Social packet is good. Free lunches, insurance, etc. " "management and working hours. No life good office and nice infrastructure " ...
## $ Category : Factor w/ 3 levels "Entry","Mid",..: 1 1 1 1 3 1 1 1 1 2 ...
## $ Count : int [1:2022] 16 66 19 11 69 15 18 13 20 131 ...
# ---- Normalise the review text and annotate it with udpipe ----
# Lower-case, strip punctuation and digits, then tidy up the whitespace.
glass$Text <- glass$Text %>%
  tolower() %>%
  removePunctuation() %>%
  removeNumbers() %>%
  trimws() %>%
  stripWhitespace()
# Write both spellings of "work life" as the single token "worklife".
# NOTE(review): punctuation has already been removed above, so the hyphenated
# pattern is not expected to match any more; kept as in the original.
glass$Text <- stri_replace_all_fixed(glass$Text, pattern = "work life",
                                     replacement = "worklife")
glass$Text <- stri_replace_all_fixed(glass$Text, pattern = "work-life",
                                     replacement = "worklife")
# Download the English udpipe model and load it from the downloaded file.
ud_model <- udpipe_load_model(
  udpipe_download_model(language = "english")$file_model
)
# Annotate every review and inspect the resulting token table.
x <- as.data.frame(udpipe_annotate(ud_model, x = glass$Text))
str(x)
## 'data.frame': 72905 obs. of 14 variables:
## $ doc_id : chr "doc1" "doc1" "doc1" "doc1" ...
## $ paragraph_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sentence_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ sentence : chr "did not notice any cons interesting challenging tasks highly professional specialists ideal working conditions "| __truncated__ "did not notice any cons interesting challenging tasks highly professional specialists ideal working conditions "| __truncated__ "did not notice any cons interesting challenging tasks highly professional specialists ideal working conditions "| __truncated__ "did not notice any cons interesting challenging tasks highly professional specialists ideal working conditions "| __truncated__ ...
## $ token_id : chr "1" "2" "3" "4" ...
## $ token : chr "did" "not" "notice" "any" ...
## $ lemma : chr "do" "not" "notice" "any" ...
## $ upos : chr "AUX" "PART" "VERB" "DET" ...
## $ xpos : chr "VBD" "RB" "VB" "DT" ...
## $ feats : chr "Mood=Ind|Tense=Past|VerbForm=Fin" NA "VerbForm=Inf" NA ...
## $ head_token_id: chr "3" "3" "0" "5" ...
## $ dep_rel : chr "aux" "advmod" "root" "det" ...
## $ deps : chr NA NA NA NA ...
## $ misc : chr NA NA NA NA ...
# ---- Yandex: emotions expressed in the reviews (NRC lexicon) ----
# Annotate the Yandex reviews only.
yandex <- glass %>% filter(Company == "Yandex")
y <- udpipe_annotate(ud_model, x = yandex$Text)
y <- as.data.frame(y)
# Keep only the eight emotion categories of the NRC lexicon listed here.
emo <- get_sentiments("nrc") %>%
  filter(sentiment %in% c("anger", "anticipation", "disgust", "fear", "joy",
                          "sadness", "surprise", "trust"))
# Match every lemma against the lexicon; the join key is spelled out so that
# it does not depend on whichever columns the two tables happen to share.
y_emo <- y %>%
  select(lemma) %>%
  rename(word = lemma) %>%
  inner_join(emo, by = "word")
# Frequency and rounded percentage share of each emotion.
emo_mean <- y_emo %>%
  group_by(sentiment) %>%
  summarize(freq = n()) %>%
  mutate(percent = round(freq / sum(freq) * 100)) %>%
  as.data.frame()
# Fixed display order of the emotions; the first level is the one that the
# manual fill scale below paints grey.
y_emo$sentiment <- factor(
  y_emo$sentiment,
  levels = c("trust", "anticipation", "joy", "surprise", "fear",
             "sadness", "anger", "disgust")
)
# Bar heights are shares of all matched words, not raw counts.
ggplot(aes(x = sentiment), data = y_emo) +
  geom_bar(aes(y = (..count..) / sum(..count..), fill = sentiment),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Brand love & hate (Yandex)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_fill_manual(values = c("grey", "white", "white", "white", "white",
                               "white", "white", "white"))
# ---- Yandex reviews: most frequent lemmas by part of speech ----
# For each part of speech: count the lemmas, reverse the factor levels so the
# first row of the frequency table is drawn at the top, and chart 20 rows.

# Nouns
stats_n <- txt_freq(x = y$lemma[y$upos %in% "NOUN"])
stats_n$key <- with(stats_n, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_n, 20),
  col = "cadetblue",
  main = "Most occurring nouns (Yandex)",
  xlab = "Frequency"
)

# Adjectives
stats_a <- txt_freq(x = y$lemma[y$upos %in% c("ADJ")])
stats_a$key <- with(stats_a, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_a, 20),
  col = "purple",
  main = "Most occurring adjectives (Yandex)",
  xlab = "Frequency"
)

# Verbs
stats_v <- txt_freq(x = y$lemma[y$upos %in% c("VERB")])
stats_v$key <- with(stats_v, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_v, 20),
  col = "gold",
  main = "Most occurring verbs (Yandex)",
  xlab = "Frequency"
)
# ---- Load and clean the company texts ----
# Only the first three rows of the third sheet are used.
comp <- read_excel("/Users/liza/Desktop/ДИПЛОМ/companies data.xlsx", sheet = 3)
comp <- comp[1:3, ]
# Same normalisation as for the reviews: lower-case, no punctuation or digits,
# tidy whitespace.
comp$Text <- comp$Text %>%
  tolower() %>%
  removePunctuation() %>%
  removeNumbers() %>%
  trimws() %>%
  stripWhitespace()
# Write both spellings of "work life" as the single token "worklife".
comp$Text <- stri_replace_all_fixed(comp$Text, pattern = "work life",
                                    replacement = "worklife")
comp$Text <- stri_replace_all_fixed(comp$Text, pattern = "work-life",
                                    replacement = "worklife")
# Counting the number of space-separated words in each text
comp$Count <- vapply(strsplit(comp$Text, " "), length, integer(1))
summary(comp)
## Company Text ...3 Count
## Length:3 Length:3 Mode:logical Min. :455.0
## Class :character Class :character NA's:3 1st Qu.:484.5
## Mode :character Mode :character Median :514.0
## Mean :595.7
## 3rd Qu.:666.0
## Max. :818.0
# ---- Yandex company text: most frequent lemmas by part of speech ----
comp_y <- filter(comp, Company == "Yandex")
y2 <- as.data.frame(udpipe_annotate(ud_model, x = comp_y$Text))

# Nouns
stats_n2 <- txt_freq(x = y2$lemma[y2$upos %in% "NOUN"])
stats_n2$key <- with(stats_n2, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_n2, 20),
  col = "cadetblue",
  main = "Most occurring nouns (Yandex 2)",
  xlab = "Frequency"
)

# Adjectives
stats_a2 <- txt_freq(x = y2$lemma[y2$upos %in% c("ADJ")])
stats_a2$key <- with(stats_a2, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_a2, 20),
  col = "purple",
  main = "Most occurring adjectives (Yandex 2)",
  xlab = "Frequency"
)

# Verbs
stats_v2 <- txt_freq(x = y2$lemma[y2$upos %in% c("VERB")])
stats_v2$key <- with(stats_v2, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_v2, 20),
  col = "gold",
  main = "Most occurring verbs (Yandex 2)",
  xlab = "Frequency"
)
# ---- Yandex: distribution of the ratings ----
summary(yandex$Rating)
## 1 2 3 4 5 NA's
## 15 27 95 353 656 1
# Leave out reviews without a rating so that NA does not get its own bar.
yandex_na <- drop_na(yandex, Rating)
# Bar heights are shares of all rated reviews; the fifth fill value (grey)
# belongs to rating 5.
ggplot(yandex_na, aes(x = Rating)) +
  geom_bar(
    aes(y = (..count..) / sum(..count..), fill = Rating),
    color = "black",
    show.legend = FALSE
  ) +
  scale_x_discrete(labels = paste("Rating", 1:5)) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  scale_fill_manual(values = c(rep("white", 4), "grey")) +
  labs(
    title = "Ratings (Yandex)",
    x = "",
    y = "",
    caption = "Grey highlights the largest group"
  ) +
  theme_minimal()
# ---- Yandex: AFINN sentiment per sentence ----
# Score every lemma that appears in the AFINN lexicon and average the scores
# within each sentence string.
# NOTE(review): the grouping key is the sentence text itself, so identical
# sentences coming from different reviews are pooled into one row. Confirm
# this is intended; grouping by doc_id/sentence_id would keep them apart.
y_sent <- y %>%
  transmute(word = lemma, sentence) %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(sentence) %>%
  summarize(overall = mean(value))
summary(y_sent$overall)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.0000 0.8452 1.5000 1.4309 2.0000 4.0000
# Histogram of the sentence scores with a density curve on top.
ggplot(y_sent, aes(x = overall)) +
  geom_histogram(aes(y = ..density..), color = "#868686FF", fill = "white") +
  geom_density(color = "black", size = 0.7) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Sentiments (Yandex)", x = "Sentiment", y = "Density")
# Number of sentences whose mean score is positive.
nrow(y_sent[y_sent$overall > 0, ])
## [1] 993
# ---- Yandex: organizational tenure of current employees ----
yandex2 <- yandex %>%
  filter(EmployeeStatus == "Current")
summary(yandex2$WorkExperience)
## less than 1 year more than 1 year more than 10 years more than 3 years
## 93 166 8 128
## more than 5 years more than 8 years NA's
## 68 22 266
# Reviews without a stated work experience are left out of the chart.
yandex_na <- yandex2 %>% drop_na(WorkExperience)
# Chronological order of the tenure groups (the factor itself is alphabetical,
# which would put "more than 10 years" before "more than 3 years").
level_order <- c("less than 1 year", "more than 1 year", "more than 3 years",
                 "more than 5 years", "more than 8 years", "more than 10 years")
# `levels` is written out in full instead of relying on partial matching of
# `level`. The fill still uses the original factor, so the manual fill values
# follow its alphabetical levels: the second one (grey) is "more than 1 year".
ggplot(yandex_na, aes(x = WorkExperience)) +
  geom_bar(aes(x = factor(WorkExperience, levels = level_order),
               y = (..count..) / sum(..count..), fill = WorkExperience),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Organizational tenures of current staff (Yandex)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  coord_flip() +
  scale_fill_manual(values = c("white", "grey", "white", "white", "white", "white"))
# ---- Yandex: RAKE keywords for current and former employees ----
# Only reviewers with a stated experience other than "less than 1 year" are
# used; rows with a missing WorkExperience are dropped by the comparison too.

# Current employees
yandex3 <- filter(yandex2, WorkExperience != "less than 1 year")
y3 <- as.data.frame(udpipe_annotate(ud_model, x = yandex3$Text))
# Keywords of up to four tokens built from nouns and adjectives, looked for
# within each sentence of each document.
keyw_rake2 <- keywords_rake(
  x = y3,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = y3$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake2$key <- with(keyw_rake2, factor(keyword, levels = rev(keyword)))
# Chart the first 20 keywords that occur more than once.
barchart(
  key ~ rake,
  data = head(subset(keyw_rake2, freq > 1), 20),
  col = "red",
  main = "Keywords identified by RAKE for current employees (Yandex)",
  xlab = "Rake"
)

# Former employees
yandex4 <- filter(yandex, EmployeeStatus == "Former",
                  WorkExperience != "less than 1 year")
y4 <- as.data.frame(udpipe_annotate(ud_model, x = yandex4$Text))
keyw_rake3 <- keywords_rake(
  x = y4,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = y4$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake3$key <- with(keyw_rake3, factor(keyword, levels = rev(keyword)))
barchart(
  key ~ rake,
  data = head(subset(keyw_rake3, freq > 1), 20),
  col = "red",
  main = "Keywords identified by RAKE for former employees (Yandex)",
  xlab = "Rake"
)
# ---- VK: emotions expressed in the reviews (NRC lexicon) ----
# Annotate the VK reviews only.
vk <- glass %>% filter(Company == "VK")
v <- udpipe_annotate(ud_model, x = vk$Text)
v <- as.data.frame(v)
# Match every lemma against the emotion lexicon `emo`; the join key is spelled
# out so that it does not depend on the columns the two tables share.
v_emo <- v %>%
  select(lemma) %>%
  rename(word = lemma) %>%
  inner_join(emo, by = "word")
# Frequency and rounded percentage share of each emotion.
emo_mean2 <- v_emo %>%
  group_by(sentiment) %>%
  summarize(freq = n()) %>%
  mutate(percent = round(freq / sum(freq) * 100)) %>%
  as.data.frame()
# Fixed display order of the emotions; the first level is the one that the
# manual fill scale below paints grey.
v_emo$sentiment <- factor(
  v_emo$sentiment,
  levels = c("trust", "anticipation", "joy", "surprise", "fear",
             "sadness", "anger", "disgust")
)
# Bar heights are shares of all matched words, not raw counts.
ggplot(aes(x = sentiment), data = v_emo) +
  geom_bar(aes(y = (..count..) / sum(..count..), fill = sentiment),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Brand love & hate (VK)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_fill_manual(values = c("grey", "white", "white", "white", "white",
                               "white", "white", "white"))
# ---- VK reviews: most frequent lemmas by part of speech ----
# For each part of speech: count the lemmas, reverse the factor levels so the
# first row of the frequency table is drawn at the top, and chart 20 rows.

# Nouns
stats_n <- txt_freq(x = v$lemma[v$upos %in% "NOUN"])
stats_n$key <- with(stats_n, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_n, 20),
  col = "cadetblue",
  main = "Most occurring nouns (VK)",
  xlab = "Frequency"
)

# Adjectives
stats_a <- txt_freq(x = v$lemma[v$upos %in% c("ADJ")])
stats_a$key <- with(stats_a, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_a, 20),
  col = "purple",
  main = "Most occurring adjectives (VK)",
  xlab = "Frequency"
)

# Verbs
stats_v <- txt_freq(x = v$lemma[v$upos %in% c("VERB")])
stats_v$key <- with(stats_v, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_v, 20),
  col = "gold",
  main = "Most occurring verbs (VK)",
  xlab = "Frequency"
)
# ---- VK company text: most frequent lemmas by part of speech ----
comp_v <- filter(comp, Company == "VK")
v2 <- as.data.frame(udpipe_annotate(ud_model, x = comp_v$Text))

# Nouns
stats_n2 <- txt_freq(x = v2$lemma[v2$upos %in% "NOUN"])
stats_n2$key <- with(stats_n2, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_n2, 20),
  col = "cadetblue",
  main = "Most occurring nouns (VK 2)",
  xlab = "Frequency"
)

# Adjectives
stats_a2 <- txt_freq(x = v2$lemma[v2$upos %in% c("ADJ")])
stats_a2$key <- with(stats_a2, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_a2, 20),
  col = "purple",
  main = "Most occurring adjectives (VK 2)",
  xlab = "Frequency"
)

# Verbs
stats_v2 <- txt_freq(x = v2$lemma[v2$upos %in% c("VERB")])
stats_v2$key <- with(stats_v2, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_v2, 20),
  col = "gold",
  main = "Most occurring verbs (VK 2)",
  xlab = "Frequency"
)
# ---- VK: distribution of the ratings ----
summary(vk$Rating)
## 1 2 3 4 5
## 9 14 38 140 153
# Bar heights are shares of all VK reviews. The palette has exactly one colour
# per rating level (the fifth, grey, belongs to rating 5), matching the
# rating charts of the other companies.
ggplot(vk, aes(x = Rating)) +
  geom_bar(aes(y = (..count..) / sum(..count..), fill = Rating),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Ratings (VK)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_x_discrete(labels = c("Rating 1", "Rating 2", "Rating 3", "Rating 4", "Rating 5")) +
  scale_fill_manual(values = c("white", "white", "white", "white", "grey"))
# ---- VK: AFINN sentiment per sentence ----
# Score every lemma that appears in the AFINN lexicon and average the scores
# within each sentence string.
# NOTE(review): the grouping key is the sentence text itself, so identical
# sentences coming from different reviews are pooled into one row. Confirm
# this is intended; grouping by doc_id/sentence_id would keep them apart.
v_sent <- v %>%
  transmute(word = lemma, sentence) %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(sentence) %>%
  summarize(overall = mean(value))
summary(v_sent$overall)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.0000 0.6667 1.5000 1.3869 2.2000 3.5000
# Histogram of the sentence scores with a density curve on top.
ggplot(v_sent, aes(x = overall)) +
  geom_histogram(aes(y = ..density..), color = "#868686FF", fill = "white") +
  geom_density(color = "black", size = 1) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Sentiments (VK)", x = "Sentiment", y = "Density")
# Number of sentences whose mean score is positive.
nrow(v_sent[v_sent$overall > 0, ])
## [1] 289
# ---- VK: organizational tenure of current employees ----
vk2 <- vk %>%
  filter(EmployeeStatus == "Current")
summary(vk2$WorkExperience)
## less than 1 year more than 1 year more than 10 years more than 3 years
## 29 46 1 37
## more than 5 years more than 8 years NA's
## 12 7 95
# Reviews without a stated work experience are left out of the chart.
vk_na <- vk2 %>% drop_na(WorkExperience)
# Chronological order of the tenure groups (the factor itself is alphabetical,
# which would put "more than 10 years" before "more than 3 years").
level_order <- c("less than 1 year", "more than 1 year", "more than 3 years",
                 "more than 5 years", "more than 8 years", "more than 10 years")
# `levels` is written out in full instead of relying on partial matching of
# `level`. The fill still uses the original factor, so the manual fill values
# follow its alphabetical levels: the second one (grey) is "more than 1 year".
ggplot(vk_na, aes(x = WorkExperience)) +
  geom_bar(aes(x = factor(WorkExperience, levels = level_order),
               y = (..count..) / sum(..count..), fill = WorkExperience),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Organizational tenures of current staff (VK)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  coord_flip() +
  scale_fill_manual(values = c("white", "grey", "white", "white", "white", "white"))
# ---- VK: RAKE keywords for current and former employees ----
# Only reviewers with a stated experience other than "less than 1 year" are
# used; rows with a missing WorkExperience are dropped by the comparison too.

# Current employees
vk3 <- filter(vk2, WorkExperience != "less than 1 year")
v3 <- as.data.frame(udpipe_annotate(ud_model, x = vk3$Text))
# Keywords of up to four tokens built from nouns and adjectives, looked for
# within each sentence of each document.
keyw_rake2 <- keywords_rake(
  x = v3,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = v3$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake2$key <- with(keyw_rake2, factor(keyword, levels = rev(keyword)))
# Chart the first 20 keywords that occur more than once.
barchart(
  key ~ rake,
  data = head(subset(keyw_rake2, freq > 1), 20),
  col = "red",
  main = "Keywords identified by RAKE for current employees (VK)",
  xlab = "Rake"
)

# Former employees
vk4 <- filter(vk, EmployeeStatus == "Former",
              WorkExperience != "less than 1 year")
v4 <- as.data.frame(udpipe_annotate(ud_model, x = vk4$Text))
keyw_rake3 <- keywords_rake(
  x = v4,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = v4$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake3$key <- with(keyw_rake3, factor(keyword, levels = rev(keyword)))
barchart(
  key ~ rake,
  data = head(subset(keyw_rake3, freq > 1), 20),
  col = "red",
  main = "Keywords identified by RAKE for former employees (VK)",
  xlab = "Rake"
)
# ---- Kaspersky: emotions expressed in the reviews (NRC lexicon) ----
# Annotate the Kaspersky reviews only.
kaspersky <- glass %>% filter(Company == "Kaspersky")
k <- udpipe_annotate(ud_model, x = kaspersky$Text)
k <- as.data.frame(k)
# Match every lemma against the emotion lexicon `emo`; the join key is spelled
# out so that it does not depend on the columns the two tables share.
k_emo <- k %>%
  select(lemma) %>%
  rename(word = lemma) %>%
  inner_join(emo, by = "word")
# Frequency and rounded percentage share of each emotion.
emo_mean3 <- k_emo %>%
  group_by(sentiment) %>%
  summarize(freq = n()) %>%
  mutate(percent = round(freq / sum(freq) * 100)) %>%
  as.data.frame()
# Fixed display order of the emotions (here "anger" comes before "sadness");
# the first level is the one that the manual fill scale below paints grey.
k_emo$sentiment <- factor(
  k_emo$sentiment,
  levels = c("trust", "anticipation", "joy", "surprise", "fear",
             "anger", "sadness", "disgust")
)
# Bar heights are shares of all matched words, not raw counts.
# The company name in the title was misspelled as "Kasperksy".
ggplot(aes(x = sentiment), data = k_emo) +
  geom_bar(aes(y = (..count..) / sum(..count..), fill = sentiment),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Brand love & hate (Kaspersky)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_fill_manual(values = c("grey", "white", "white", "white", "white",
                               "white", "white", "white"))
# ---- Kaspersky reviews: most frequent lemmas by part of speech ----
# For each part of speech: count the lemmas, reverse the factor levels so the
# first row of the frequency table is drawn at the top, and chart 20 rows.

# Nouns
stats_n <- txt_freq(x = k$lemma[k$upos %in% "NOUN"])
stats_n$key <- with(stats_n, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_n, 20),
  col = "cadetblue",
  main = "Most occurring nouns (Kaspersky)",
  xlab = "Frequency"
)

# Adjectives
stats_a <- txt_freq(x = k$lemma[k$upos %in% c("ADJ")])
stats_a$key <- with(stats_a, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_a, 20),
  col = "purple",
  main = "Most occurring adjectives (Kaspersky)",
  xlab = "Frequency"
)

# Verbs
stats_v <- txt_freq(x = k$lemma[k$upos %in% c("VERB")])
stats_v$key <- with(stats_v, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_v, 20),
  col = "gold",
  main = "Most occurring verbs (Kaspersky)",
  xlab = "Frequency"
)
# ---- Kaspersky company text: most frequent lemmas by part of speech ----
comp_k <- filter(comp, Company == "Kaspersky")
k2 <- as.data.frame(udpipe_annotate(ud_model, x = comp_k$Text))

# Nouns
stats_n2 <- txt_freq(x = k2$lemma[k2$upos %in% "NOUN"])
stats_n2$key <- with(stats_n2, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_n2, 20),
  col = "cadetblue",
  main = "Most occurring nouns (Kaspersky 2)",
  xlab = "Frequency"
)

# Adjectives
stats_a2 <- txt_freq(x = k2$lemma[k2$upos %in% c("ADJ")])
stats_a2$key <- with(stats_a2, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_a2, 20),
  col = "purple",
  main = "Most occurring adjectives (Kaspersky 2)",
  xlab = "Frequency"
)

# Verbs
stats_v2 <- txt_freq(x = k2$lemma[k2$upos %in% c("VERB")])
stats_v2$key <- with(stats_v2, factor(key, levels = rev(key)))
barchart(
  key ~ freq,
  data = head(stats_v2, 20),
  col = "gold",
  main = "Most occurring verbs (Kaspersky 2)",
  xlab = "Frequency"
)
# ---- Kaspersky: distribution of the ratings ----
summary(kaspersky$Rating)
## 1 2 3 4 5
## 34 29 60 156 242
# Bar heights are shares of all Kaspersky reviews; the fifth fill value (grey)
# belongs to rating 5.
ggplot(kaspersky, aes(x = Rating)) +
  geom_bar(
    aes(y = (..count..) / sum(..count..), fill = Rating),
    color = "black",
    show.legend = FALSE
  ) +
  scale_x_discrete(labels = paste("Rating", 1:5)) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  scale_fill_manual(values = c(rep("white", 4), "grey")) +
  labs(
    title = "Ratings (Kaspersky)",
    x = "",
    y = "",
    caption = "Grey highlights the largest group"
  ) +
  theme_minimal()
# ---- Kaspersky: AFINN sentiment per sentence ----
# Score every lemma that appears in the AFINN lexicon and average the scores
# within each sentence string.
# NOTE(review): the grouping key is the sentence text itself, so identical
# sentences coming from different reviews are pooled into one row. Confirm
# this is intended; grouping by doc_id/sentence_id would keep them apart.
k_sent <- k %>%
  transmute(word = lemma, sentence) %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(sentence) %>%
  summarize(overall = mean(value))
summary(k_sent$overall)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.0000 0.8628 1.6000 1.4650 2.1667 4.0000
# Histogram of the sentence scores with a density curve on top.
ggplot(k_sent, aes(x = overall)) +
  geom_histogram(aes(y = ..density..), color = "#868686FF", fill = "white") +
  geom_density(color = "black", size = 1) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Sentiments (Kaspersky)", x = "Sentiment", y = "Density")
# Number of sentences whose mean score is positive.
nrow(k_sent[k_sent$overall > 0, ])
## [1] 459
# ---- Kaspersky: organizational tenure of current employees ----
kaspersky2 <- kaspersky %>%
  filter(EmployeeStatus == "Current")
summary(kaspersky2$WorkExperience)
## less than 1 year more than 1 year more than 10 years more than 3 years
## 44 66 11 58
## more than 5 years more than 8 years NA's
## 42 17 112
# Reviews without a stated work experience are left out of the chart.
kaspersky_na <- kaspersky2 %>% drop_na(WorkExperience)
# Chronological order of the tenure groups (the factor itself is alphabetical,
# which would put "more than 10 years" before "more than 3 years").
level_order <- c("less than 1 year", "more than 1 year", "more than 3 years",
                 "more than 5 years", "more than 8 years", "more than 10 years")
# `levels` is written out in full instead of relying on partial matching of
# `level`. The fill still uses the original factor, so the manual fill values
# follow its alphabetical levels: the second one (grey) is "more than 1 year".
ggplot(kaspersky_na, aes(x = WorkExperience)) +
  geom_bar(aes(x = factor(WorkExperience, levels = level_order),
               y = (..count..) / sum(..count..), fill = WorkExperience),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Organizational tenures of current staff (Kaspersky)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  coord_flip() +
  scale_fill_manual(values = c("white", "grey", "white", "white", "white", "white"))
# ---- Kaspersky: RAKE keywords for current and former employees ----
# Only reviewers with a stated experience other than "less than 1 year" are
# used; rows with a missing WorkExperience are dropped by the comparison too.

# Current employees
kaspersky3 <- filter(kaspersky2, WorkExperience != "less than 1 year")
k3 <- as.data.frame(udpipe_annotate(ud_model, x = kaspersky3$Text))
# Keywords of up to four tokens built from nouns and adjectives, looked for
# within each sentence of each document.
keyw_rake2 <- keywords_rake(
  x = k3,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = k3$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake2$key <- with(keyw_rake2, factor(keyword, levels = rev(keyword)))
# Chart the first 20 keywords that occur more than once.
barchart(
  key ~ rake,
  data = head(subset(keyw_rake2, freq > 1), 20),
  col = "red",
  main = "Keywords identified by RAKE for current employees (Kaspersky)",
  xlab = "Rake"
)

# Former employees
kaspersky4 <- filter(kaspersky, EmployeeStatus == "Former",
                     WorkExperience != "less than 1 year")
k4 <- as.data.frame(udpipe_annotate(ud_model, x = kaspersky4$Text))
keyw_rake3 <- keywords_rake(
  x = k4,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = k4$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake3$key <- with(keyw_rake3, factor(keyword, levels = rev(keyword)))
barchart(
  key ~ rake,
  data = head(subset(keyw_rake3, freq > 1), 20),
  col = "red",
  main = "Keywords identified by RAKE for former employees (Kaspersky)",
  xlab = "Rake"
)