Intro

Loading libraries

library(scales)
library(corrplot)
library(plyr)
library(tidyr)
library(dplyr)
library(stringr)
library(tm)
library(tidytext)
library(stopwords)
library(textstem)
library(readxl)
library(textdata)
library(ggpubr)
library(wordcloud)
library(topicmodels)
library(ggplot2)
library(textrank)
library(udpipe)
library(lattice)
library(igraph)
library(ggraph)
library(gofastr)
library(quanteda)
library(readtext)
library(devtools)
library(spacyr)
# devtools::install_github("quanteda/quanteda.corpora")
library(quanteda.corpora)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(seededlda)
library(lubridate)
library(stm)
library(Rtsne)
library(rsvd)
library(geometry)
library(tidyverse)
library(stringi)
library(ldatuning)
library(knitr)

Importing data

# Read the Glassdoor reviews and the job-title lookup sheet.
# NOTE(review): both paths are absolute and machine-specific; they are kept
# unchanged so the script keeps reading the same files.
glass <- read_excel("/Users/liza/Desktop/ДИПЛОМ/Glassdoor reviews 2.xlsx")
jobs <- read_excel("/Users/liza/Desktop/ДИПЛОМ/jobs.xlsx", sheet = 6)

# Attach the job-title information to the reviews and drop exact duplicates.
glass <- full_join(x = glass, y = jobs, by = "JobTitle")
glass <- unique(glass)

# A full join may keep job titles that match no review; such rows have no
# Company and are removed. A logical mask is used instead of
# `glass[-which(...), ]`: with no missing companies `which()` returns an empty
# vector, and a negative index built from it selects zero rows, which would
# silently empty the whole table.
na <- which(is.na(glass$Company)) # row positions, kept for inspection
glass <- glass[!is.na(glass$Company), ]

Transforming variables

# Mail.Ru has since been renamed to VK, so reviews filed under the old name
# are relabelled before the column becomes a factor.
glass$Company <- ifelse(glass$Company == "Mail.Ru", "VK", glass$Company)
glass$Company <- as.factor(glass$Company)

# Parse the review date from day-month-year text into a Date.
glass$Date <- dmy(glass$Date)

glass$EmployeeStatus <- as.factor(glass$EmployeeStatus)

glass$EmployeeType <- as.factor(glass$EmployeeType)

# In some rows the leading "l" of "less than 1 year" is missing; repair it so
# that both spellings end up in the same factor level.
glass$WorkExperience <- ifelse(glass$WorkExperience == "ess than 1 year",
                               "less than 1 year", glass$WorkExperience)
glass$WorkExperience <- as.factor(glass$WorkExperience)

# Reviewers who did not indicate a job title have the category "Anonymous".
# It is turned into NA so that they do not form a separate group.
glass$Category <- ifelse(glass$Category == "Anonymous", NA, glass$Category)
glass$Category <- as.factor(glass$Category)

glass$ID <- as.factor(glass$ID)

# Same treatment for the job titles: all "Anonymous ..." variants are first
# merged into one level, which is then replaced by NA.
glass$JobTitle <- as.factor(glass$JobTitle)
glass$JobTitle <- fct_collapse(
  glass$JobTitle,
  "Anonymous" = c("Anonymous Employee", "Anonymous", "Anonymous Contractor",
                  "Anonymous Freelancer", "Anonymous Intern")
)
levels(glass$JobTitle)[levels(glass$JobTitle) == "Anonymous"] <- NA

glass$Location <- as.factor(glass$Location)

glass$Rating <- as.factor(glass$Rating)

# Creating variable 'Text': missing parts become empty strings so that no
# literal "NA" ends up in the combined review text.
glass$Pros <- ifelse(is.na(glass$Pros), "", glass$Pros)
glass$Cons <- ifelse(is.na(glass$Cons), "", glass$Cons)
glass$AdviceToManagement <- ifelse(is.na(glass$AdviceToManagement), "",
                                   glass$AdviceToManagement)
glass <- unite(glass, Text, c(Cons, Pros, AdviceToManagement), sep = " ")

# Counting the number of space-separated pieces in each review. `lengths()`
# always returns an integer vector, whereas `sapply(..., length)` returns an
# empty list when there are no rows.
glass$Count <- lengths(strsplit(glass$Text, " "))

summary(glass)

##        ID            Company        Title            Rating     EmployeeStatus
##  K1     :   1   Kaspersky: 521   Length:2022        1   :  58   Current:1328  
##  K10    :   1   VK       : 354   Class :character   2   :  70   Former : 693  
##  K100   :   1   Yandex   :1147   Mode  :character   3   : 193   NA's   :   1  
##  K101   :   1                                       4   : 649                 
##  K102   :   1                                       5   :1051                 
##  K103   :   1                                       NA's:   1                 
##  (Other):2016                                                                 
##              EmployeeType             WorkExperience
##  Contractor        :  27   less than 1 year  :280   
##  Employee          :1879   more than 1 year  :454   
##  Freelancer        :  11   more than 10 years: 23   
##  Intern            :  99   more than 3 years :316   
##  Temporary Employee:   5   more than 5 years :191   
##  NA's              :   1   more than 8 years : 60   
##                            NA's              :698   
##                      JobTitle                Location         Date           
##  Software Engineer       : 176   Moscow          :1147   Min.   :2009-02-23  
##  Software Developer      :  92   Saint Petersburg: 112   1st Qu.:2016-12-09  
##  Project Manager         :  68   Woburn          :  48   Median :2019-12-22  
##  Senior Software Engineer:  63   London          :  30   Mean   :2018-12-29  
##  Frontend Developer      :  48   Yekaterinburg   :  24   3rd Qu.:2021-04-02  
##  (Other)                 :1185   (Other)         : 194   Max.   :2022-02-28  
##  NA's                    : 390   NA's            : 467   NA's   :1           
##      Text             Category       Count       
##  Length:2022        Entry :998   Min.   :  6.00  
##  Class :character   Mid   :401   1st Qu.: 13.00  
##  Mode  :character   Senior:231   Median : 20.00  
##                     NA's  :392   Mean   : 36.11  
##                                  3rd Qu.: 40.75  
##                                  Max.   :435.00  
##

# Print the structure of `glass` to check the column types.
str(glass)

## tibble [2,022 × 13] (S3: tbl_df/tbl/data.frame)
##  $ ID            : Factor w/ 2022 levels "K1","K10","K100",..: 876 1135 1246 1357 1468 1579 1690 1801 1912 877 ...
##  $ Company       : Factor w/ 3 levels "Kaspersky","VK",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Title         : chr [1:2022] "Great company" "Used to be nice" "good" "Nice for start" ...
##  $ Rating        : Factor w/ 5 levels "1","2","3","4",..: 5 3 5 3 5 5 5 4 5 3 ...
##  $ EmployeeStatus: Factor w/ 2 levels "Current","Former": 2 1 2 2 2 1 2 1 2 2 ...
##  $ EmployeeType  : Factor w/ 5 levels "Contractor","Employee",..: 2 2 3 2 2 4 2 2 1 2 ...
##  $ WorkExperience: Factor w/ 6 levels "less than 1 year",..: 4 2 NA NA 3 NA 2 NA NA 2 ...
##  $ JobTitle      : Factor w/ 549 levels ".NET Developer",..: 90 148 458 14 439 338 465 465 96 503 ...
##  $ Location      : Factor w/ 93 levels "Abingdon","Abuja",..: 52 NA 52 52 52 2 92 52 82 52 ...
##  $ Date          : Date[1:2022], format: "2020-12-08" "2020-12-08" ...
##  $ Text          : chr [1:2022] "Did not notice any Cons interesting challenging tasks \r\nhighly professional specialists \r\nideal working con"| __truncated__ "Outdated technical stack with lots of domestic solutions unknown outside Yandex, and with poor documentation. N"| __truncated__ "It's a big company. Salary is medium-high. It's a big company. Social packet is good. Free lunches, insurance, etc. " "management and working hours. No life good office and nice infrastructure " ...
##  $ Category      : Factor w/ 3 levels "Entry","Mid",..: 1 1 1 1 3 1 1 1 1 2 ...
##  $ Count         : int [1:2022] 16 66 19 11 69 15 18 13 20 131 ...

Textual data preprocessing

# Normalise the review text: lower-case it, drop punctuation and digits, trim
# both ends and collapse runs of whitespace.
glass$Text <- glass$Text %>%
  tolower() %>%
  removePunctuation() %>%
  removeNumbers() %>%
  trimws() %>%
  stripWhitespace()

# Map the spellings "work life" and "work-life" onto the single token
# "worklife".
# NOTE(review): punctuation has already been removed at this point, so the
# hyphenated form probably no longer occurs and the second call looks like a
# no-op -- confirm.
glass$Text <- stri_replace_all_fixed(glass$Text, "work life", "worklife")
glass$Text <- stri_replace_all_fixed(glass$Text, "work-life", "worklife")

Constructing udpipe model

# Download the English udpipe model; the call returns a description of the
# downloaded file, which is replaced by the loaded model two steps below.
ud_model <- udpipe_download_model(language = "english")

# Stop with the downloader's own message when no model file was obtained,
# rather than letting the load step fail on a missing file.
if (isTRUE(ud_model$download_failed)) {
  stop("udpipe model download failed: ", ud_model$download_message,
       call. = FALSE)
}
ud_model <- udpipe_load_model(ud_model$file_model)

# Annotate all reviews and inspect the resulting token-level data frame.
x <- udpipe_annotate(ud_model, x = glass$Text)
x <- as.data.frame(x)
str(x)

## 'data.frame':    72905 obs. of  14 variables:
##  $ doc_id       : chr  "doc1" "doc1" "doc1" "doc1" ...
##  $ paragraph_id : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ sentence_id  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ sentence     : chr  "did not notice any cons interesting challenging tasks highly professional specialists ideal working conditions "| __truncated__ "did not notice any cons interesting challenging tasks highly professional specialists ideal working conditions "| __truncated__ "did not notice any cons interesting challenging tasks highly professional specialists ideal working conditions "| __truncated__ "did not notice any cons interesting challenging tasks highly professional specialists ideal working conditions "| __truncated__ ...
##  $ token_id     : chr  "1" "2" "3" "4" ...
##  $ token        : chr  "did" "not" "notice" "any" ...
##  $ lemma        : chr  "do" "not" "notice" "any" ...
##  $ upos         : chr  "AUX" "PART" "VERB" "DET" ...
##  $ xpos         : chr  "VBD" "RB" "VB" "DT" ...
##  $ feats        : chr  "Mood=Ind|Tense=Past|VerbForm=Fin" NA "VerbForm=Inf" NA ...
##  $ head_token_id: chr  "3" "3" "0" "5" ...
##  $ dep_rel      : chr  "aux" "advmod" "root" "det" ...
##  $ deps         : chr  NA NA NA NA ...
##  $ misc         : chr  NA NA NA NA ...

Yandex

# Keep the Yandex reviews only and annotate their text with the udpipe model;
# `y` holds one row per token.
yandex <- filter(glass, Company == "Yandex")

y <- as.data.frame(udpipe_annotate(ud_model, x = yandex$Text))

Brand consistent behavior (BCB)

Brand love & brand hate

# NRC lexicon restricted to the eight emotions listed below.
emo <- get_sentiments("nrc") %>%
  filter(sentiment %in% c("anger", "anticipation", "disgust", "fear", "joy",
                          "sadness", "surprise", "trust"))

# Every Yandex lemma found in the lexicon, one row per (lemma, emotion) match.
# The join key is spelled out instead of being guessed from the column names.
y_emo <- y %>%
  select(lemma) %>%
  rename(word = lemma) %>%
  inner_join(emo, by = "word")

# Share of each emotion among all matches, in whole percent.
emo_mean <- y_emo %>%
  group_by(sentiment) %>%
  summarize(freq = n()) %>%
  mutate(percent = round(freq / sum(freq) * 100)) %>%
  as.data.frame()

# Fixed display order of the emotions on the x axis.
y_emo$sentiment <- factor(
  y_emo$sentiment,
  levels = c("trust", "anticipation", "joy", "surprise", "fear",
             "sadness", "anger", "disgust")
)

# Bars show each emotion's share of all matches. `after_stat()` replaces the
# deprecated `..count..` notation, and the breaks now cover the 0-1 range of
# the proportion axis instead of 0-100.
# NOTE(review): grey is hard-coded for the first level ("trust"); the caption
# is only correct while that level is the largest group.
ggplot(y_emo, aes(x = sentiment)) +
  geom_bar(aes(y = after_stat(count / sum(count)), fill = sentiment),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.1)) +
  theme_minimal() +
  ggtitle("Brand love & hate (Yandex)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_fill_manual(values = c("grey", "white", "white", "white", "white",
                               "white", "white", "white"))

EVPs in frequent words

Employees

# Lemma frequencies per part of speech in the Yandex reviews; each chart shows
# the first 20 rows of the frequency table. The factor levels are reversed so
# that the first row is drawn at the top of the chart.

# Nouns
stats_n <- txt_freq(x = y$lemma[y$upos %in% "NOUN"])
stats_n$key <- factor(stats_n$key, levels = rev(stats_n$key))
barchart(key ~ freq, data = head(stats_n, 20), col = "cadetblue",
         main = "Most occurring nouns (Yandex)", xlab = "Frequency")

# Adjectives
stats_a <- txt_freq(y$lemma[y$upos %in% c("ADJ")])
stats_a$key <- factor(stats_a$key, levels = rev(stats_a$key))
barchart(key ~ freq, data = head(stats_a, 20), col = "purple",
         main = "Most occurring adjectives (Yandex)", xlab = "Frequency")

# Verbs
stats_v <- txt_freq(y$lemma[y$upos %in% c("VERB")])
stats_v$key <- factor(stats_v$key, levels = rev(stats_v$key))
barchart(key ~ freq, data = head(stats_v, 20), col = "gold",
         main = "Most occurring verbs (Yandex)", xlab = "Frequency")

Employer

# Employer-side texts: read the sheet and keep its first three rows.
comp <- read_excel("/Users/liza/Desktop/ДИПЛОМ/companies data.xlsx", sheet = 3)
comp <- comp[1:3, ]

# Same normalisation steps as for the review texts.
comp$Text <- comp$Text %>%
  tolower() %>%
  removePunctuation() %>%
  removeNumbers() %>%
  trimws() %>%
  stripWhitespace()

# Map "work life" and "work-life" onto the single token "worklife".
comp$Text <- stri_replace_all_fixed(comp$Text, "work life", "worklife")
comp$Text <- stri_replace_all_fixed(comp$Text, "work-life", "worklife")

# Number of space-separated pieces in each text
comp$Count <- lengths(strsplit(comp$Text, " "))

summary(comp)

##    Company              Text             ...3             Count      
##  Length:3           Length:3           Mode:logical   Min.   :455.0  
##  Class :character   Class :character   NA's:3         1st Qu.:484.5  
##  Mode  :character   Mode  :character                  Median :514.0  
##                                                       Mean   :595.7  
##                                                       3rd Qu.:666.0  
##                                                       Max.   :818.0

# Employer-side text for Yandex, annotated with the udpipe model.
comp_y <- filter(comp, Company == "Yandex")

y2 <- as.data.frame(udpipe_annotate(ud_model, x = comp_y$Text))

# Lemma frequencies per part of speech; each chart shows the first 20 rows of
# the frequency table, first row at the top.

# Nouns
stats_n2 <- txt_freq(x = y2$lemma[y2$upos %in% "NOUN"])
stats_n2$key <- factor(stats_n2$key, levels = rev(stats_n2$key))
barchart(key ~ freq, data = head(stats_n2, 20), col = "cadetblue",
         main = "Most occurring nouns (Yandex 2)", xlab = "Frequency")

# Adjectives
stats_a2 <- txt_freq(y2$lemma[y2$upos %in% c("ADJ")])
stats_a2$key <- factor(stats_a2$key, levels = rev(stats_a2$key))
barchart(key ~ freq, data = head(stats_a2, 20), col = "purple",
         main = "Most occurring adjectives (Yandex 2)", xlab = "Frequency")

# Verbs
stats_v2 <- txt_freq(y2$lemma[y2$upos %in% c("VERB")])
stats_v2$key <- factor(stats_v2$key, levels = rev(stats_v2$key))
barchart(key ~ freq, data = head(stats_v2, 20), col = "gold",
         main = "Most occurring verbs (Yandex 2)", xlab = "Frequency")

Brand endorsement

Ratings

# Number of Yandex reviews per rating level (Rating is a factor).
summary(yandex$Rating)

##    1    2    3    4    5 NA's 
##   15   27   95  353  656    1

# Reviews without a rating are left out so that the five axis labels below
# line up with the five rating levels.
yandex_na <- yandex %>% drop_na(Rating)

# Share of reviews per rating. `after_stat()` replaces the deprecated
# `..count..` notation, and the breaks now cover the 0-1 range of the
# proportion axis instead of 0-100.
# NOTE(review): grey is hard-coded for the fifth level; the caption is only
# correct while rating 5 is the largest group.
ggplot(yandex_na, aes(x = Rating)) +
  geom_bar(aes(y = after_stat(count / sum(count)), fill = Rating),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.1)) +
  theme_minimal() +
  ggtitle("Ratings (Yandex)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_x_discrete(labels = c("Rating 1", "Rating 2", "Rating 3", "Rating 4",
                              "Rating 5")) +
  scale_fill_manual(values = c("white", "white", "white", "white", "grey"))

Sentiments of reviews

# Mean AFINN score per review. Scores are grouped by `doc_id` (one id per
# annotated review) rather than by the sentence text: grouping by text merges
# reviews whose wording is identical and splits a review into several rows
# whenever the annotator finds more than one sentence in it.
# Reviews without any AFINN word drop out in the inner join.
y_sent <- y %>%
  select(doc_id, lemma) %>%
  rename(word = lemma) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(doc_id) %>%
  summarize(overall = mean(value))

summary(y_sent$overall)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.0000  0.8452  1.5000  1.4309  2.0000  4.0000

# Distribution of the mean sentiment scores with a density curve on top.
# `after_stat(density)` replaces the deprecated `..density..`; `bins = 30`
# states ggplot2's default explicitly, which silences the bin-width message.
ggplot(y_sent, aes(x = overall)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30,
                 fill = "white", color = "#868686FF") +
  geom_density(size = 0.7, color = "black") +
  labs(x = "Sentiment", y = "Density", title = "Sentiments (Yandex)") +
  theme_minimal() +
  theme(legend.position = "none")

# Number of rows with a positive mean score
sum(y_sent$overall > 0)

## [1] 993

Brand allegiance

Organizational tenures

# Reviews whose author is marked as a current Yandex employee, and the
# distribution of their work experience.
yandex2 <- filter(yandex, EmployeeStatus == "Current")

summary(yandex2$WorkExperience)

##   less than 1 year   more than 1 year more than 10 years  more than 3 years 
##                 93                166                  8                128 
##  more than 5 years  more than 8 years               NA's 
##                 68                 22                266

# Current Yandex staff with a known work experience.
yandex_na <- yandex2 %>% drop_na(WorkExperience)

# Tenure categories from shortest to longest; the factor's own levels are
# alphabetical, which would place "more than 10 years" third.
level_order <- c("less than 1 year", "more than 1 year", "more than 3 years",
                 "more than 5 years", "more than 8 years",
                 "more than 10 years")

# Share of current staff per tenure category. Changes: `levels =` is spelled
# out (the original relied on partial matching of `level`), `after_stat()`
# replaces the deprecated `..count..`, and the breaks cover the 0-1 range of
# the proportion axis.
# The fill still follows the alphabetical levels of WorkExperience, so the
# second colour (grey) goes to "more than 1 year".
# NOTE(review): that choice is hard-coded; the caption is only correct while
# this category is the largest group.
ggplot(yandex_na, aes(x = WorkExperience)) +
  geom_bar(aes(x = factor(WorkExperience, levels = level_order),
               y = after_stat(count / sum(count)), fill = WorkExperience),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.1)) +
  theme_minimal() +
  ggtitle("Organizational tenures of current staff (Yandex)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  coord_flip() +
  scale_fill_manual(values = c("white", "grey", "white", "white", "white",
                               "white"))

EVPs in frequent keywords (current staff VS alumni)

# --- Current employees ---
# Current staff whose work experience is stated and is not "less than 1 year"
# (rows with a missing value fail the comparison and are dropped as well).
yandex3 <- filter(yandex2, WorkExperience != "less than 1 year")

y3 <- as.data.frame(udpipe_annotate(ud_model, x = yandex3$Text))

# RAKE keywords of up to four tokens built from nouns and adjectives.
keyw_rake2 <- keywords_rake(
  x = y3,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = y3$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake2$key <- factor(keyw_rake2$keyword, levels = rev(keyw_rake2$keyword))
# Chart the first 20 keywords that occur more than once.
barchart(key ~ rake, data = head(keyw_rake2[keyw_rake2$freq > 1, ], 20),
         col = "red",
         main = "Keywords identified by RAKE for current employees (Yandex)",
         xlab = "Rake")

# --- Former employees ---
# Same selection rule applied to reviews by former staff.
yandex4 <- filter(yandex, EmployeeStatus == "Former",
                  WorkExperience != "less than 1 year")

y4 <- as.data.frame(udpipe_annotate(ud_model, x = yandex4$Text))

keyw_rake3 <- keywords_rake(
  x = y4,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = y4$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake3$key <- factor(keyw_rake3$keyword, levels = rev(keyw_rake3$keyword))
barchart(key ~ rake, data = head(keyw_rake3[keyw_rake3$freq > 1, ], 20),
         col = "red",
         main = "Keywords identified by RAKE for former employees (Yandex)",
         xlab = "Rake")

VK

# Keep the VK reviews only and annotate their text with the udpipe model;
# `v` holds one row per token.
vk <- filter(glass, Company == "VK")

v <- as.data.frame(udpipe_annotate(ud_model, x = vk$Text))

Brand consistent behavior (BCB)

Brand love & brand hate

# Every VK lemma found in the emotion lexicon `emo`, one row per
# (lemma, emotion) match. The join key is spelled out instead of being
# guessed from the column names.
v_emo <- v %>%
  select(lemma) %>%
  rename(word = lemma) %>%
  inner_join(emo, by = "word")

# Share of each emotion among all matches, in whole percent.
emo_mean2 <- v_emo %>%
  group_by(sentiment) %>%
  summarize(freq = n()) %>%
  mutate(percent = round(freq / sum(freq) * 100)) %>%
  as.data.frame()

# Fixed display order of the emotions on the x axis.
v_emo$sentiment <- factor(
  v_emo$sentiment,
  levels = c("trust", "anticipation", "joy", "surprise", "fear",
             "sadness", "anger", "disgust")
)

# Bars show each emotion's share of all matches. `after_stat()` replaces the
# deprecated `..count..` notation, and the breaks now cover the 0-1 range of
# the proportion axis instead of 0-100.
# NOTE(review): grey is hard-coded for the first level ("trust"); the caption
# is only correct while that level is the largest group.
ggplot(v_emo, aes(x = sentiment)) +
  geom_bar(aes(y = after_stat(count / sum(count)), fill = sentiment),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.1)) +
  theme_minimal() +
  ggtitle("Brand love & hate (VK)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_fill_manual(values = c("grey", "white", "white", "white", "white",
                               "white", "white", "white"))

EVPs in frequent words

Employees

# Lemma frequencies per part of speech in the VK reviews; each chart shows the
# first 20 rows of the frequency table. The factor levels are reversed so that
# the first row is drawn at the top of the chart.

# Nouns
stats_n <- txt_freq(x = v$lemma[v$upos %in% "NOUN"])
stats_n$key <- factor(stats_n$key, levels = rev(stats_n$key))
barchart(key ~ freq, data = head(stats_n, 20), col = "cadetblue",
         main = "Most occurring nouns (VK)", xlab = "Frequency")

# Adjectives
stats_a <- txt_freq(v$lemma[v$upos %in% c("ADJ")])
stats_a$key <- factor(stats_a$key, levels = rev(stats_a$key))
barchart(key ~ freq, data = head(stats_a, 20), col = "purple",
         main = "Most occurring adjectives (VK)", xlab = "Frequency")

# Verbs
stats_v <- txt_freq(v$lemma[v$upos %in% c("VERB")])
stats_v$key <- factor(stats_v$key, levels = rev(stats_v$key))
barchart(key ~ freq, data = head(stats_v, 20), col = "gold",
         main = "Most occurring verbs (VK)", xlab = "Frequency")

Employer

# Employer-side text for VK, annotated with the udpipe model.
comp_v <- filter(comp, Company == "VK")

v2 <- as.data.frame(udpipe_annotate(ud_model, x = comp_v$Text))

# Lemma frequencies per part of speech; each chart shows the first 20 rows of
# the frequency table, first row at the top.

# Nouns
stats_n2 <- txt_freq(x = v2$lemma[v2$upos %in% "NOUN"])
stats_n2$key <- factor(stats_n2$key, levels = rev(stats_n2$key))
barchart(key ~ freq, data = head(stats_n2, 20), col = "cadetblue",
         main = "Most occurring nouns (VK 2)", xlab = "Frequency")

# Adjectives
stats_a2 <- txt_freq(v2$lemma[v2$upos %in% c("ADJ")])
stats_a2$key <- factor(stats_a2$key, levels = rev(stats_a2$key))
barchart(key ~ freq, data = head(stats_a2, 20), col = "purple",
         main = "Most occurring adjectives (VK 2)", xlab = "Frequency")

# Verbs
stats_v2 <- txt_freq(v2$lemma[v2$upos %in% c("VERB")])
stats_v2$key <- factor(stats_v2$key, levels = rev(stats_v2$key))
barchart(key ~ freq, data = head(stats_v2, 20), col = "gold",
         main = "Most occurring verbs (VK 2)", xlab = "Frequency")

Brand endorsement

Ratings

# Number of VK reviews per rating level (Rating is a factor).
summary(vk$Rating)

##   1   2   3   4   5 
##   9  14  38 140 153

# Share of VK reviews per rating. Changes:
#  * reviews without a rating are dropped first, as in the Yandex chart, so
#    the five axis labels always line up with the five rating levels;
#  * `after_stat()` replaces the deprecated `..count..` notation;
#  * the breaks cover the 0-1 range of the proportion axis instead of 0-100;
#  * the fill palette has exactly one colour per rating level (the original
#    listed a sixth, unused one).
# NOTE(review): grey is hard-coded for the fifth level; the caption is only
# correct while rating 5 is the largest group.
ggplot(drop_na(vk, Rating), aes(x = Rating)) +
  geom_bar(aes(y = after_stat(count / sum(count)), fill = Rating),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.1)) +
  theme_minimal() +
  ggtitle("Ratings (VK)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_x_discrete(labels = c("Rating 1", "Rating 2", "Rating 3", "Rating 4",
                              "Rating 5")) +
  scale_fill_manual(values = c("white", "white", "white", "white", "grey"))

Sentiments of reviews

# Mean AFINN score per review. Scores are grouped by `doc_id` (one id per
# annotated review) rather than by the sentence text: grouping by text merges
# reviews whose wording is identical and splits a review into several rows
# whenever the annotator finds more than one sentence in it.
# Reviews without any AFINN word drop out in the inner join.
v_sent <- v %>%
  select(doc_id, lemma) %>%
  rename(word = lemma) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(doc_id) %>%
  summarize(overall = mean(value))

summary(v_sent$overall)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.0000  0.6667  1.5000  1.3869  2.2000  3.5000

# Distribution of the mean sentiment scores with a density curve on top.
# `after_stat(density)` replaces the deprecated `..density..`; `bins = 30`
# states ggplot2's default explicitly, which silences the bin-width message.
ggplot(v_sent, aes(x = overall)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30,
                 fill = "white", color = "#868686FF") +
  geom_density(size = 1, color = "black") +
  labs(x = "Sentiment", y = "Density", title = "Sentiments (VK)") +
  theme_minimal() +
  theme(legend.position = "none")

# Number of rows with a positive mean score
sum(v_sent$overall > 0)

## [1] 289

Brand allegiance

Organizational tenures

# Reviews whose author is marked as a current VK employee, and the
# distribution of their work experience.
vk2 <- filter(vk, EmployeeStatus == "Current")

summary(vk2$WorkExperience)

##   less than 1 year   more than 1 year more than 10 years  more than 3 years 
##                 29                 46                  1                 37 
##  more than 5 years  more than 8 years               NA's 
##                 12                  7                 95

# Current VK staff with a known work experience.
vk_na <- vk2 %>% drop_na(WorkExperience)

# Tenure categories from shortest to longest; the factor's own levels are
# alphabetical, which would place "more than 10 years" third.
level_order <- c("less than 1 year", "more than 1 year", "more than 3 years",
                 "more than 5 years", "more than 8 years",
                 "more than 10 years")

# Share of current staff per tenure category. Changes: `levels =` is spelled
# out (the original relied on partial matching of `level`), `after_stat()`
# replaces the deprecated `..count..`, and the breaks cover the 0-1 range of
# the proportion axis.
# The fill still follows the alphabetical levels of WorkExperience, so the
# second colour (grey) goes to "more than 1 year".
# NOTE(review): that choice is hard-coded; the caption is only correct while
# this category is the largest group.
ggplot(vk_na, aes(x = WorkExperience)) +
  geom_bar(aes(x = factor(WorkExperience, levels = level_order),
               y = after_stat(count / sum(count)), fill = WorkExperience),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.1)) +
  theme_minimal() +
  ggtitle("Organizational tenures of current staff (VK)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  coord_flip() +
  scale_fill_manual(values = c("white", "grey", "white", "white", "white",
                               "white"))

EVPs in frequent keywords (current staff VS alumni)

# --- Current employees ---
# Current staff whose work experience is stated and is not "less than 1 year"
# (rows with a missing value fail the comparison and are dropped as well).
vk3 <- filter(vk2, WorkExperience != "less than 1 year")

v3 <- as.data.frame(udpipe_annotate(ud_model, x = vk3$Text))

# RAKE keywords of up to four tokens built from nouns and adjectives.
keyw_rake2 <- keywords_rake(
  x = v3,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = v3$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake2$key <- factor(keyw_rake2$keyword, levels = rev(keyw_rake2$keyword))
# Chart the first 20 keywords that occur more than once.
barchart(key ~ rake, data = head(keyw_rake2[keyw_rake2$freq > 1, ], 20),
         col = "red",
         main = "Keywords identified by RAKE for current employees (VK)",
         xlab = "Rake")

# --- Former employees ---
# Same selection rule applied to reviews by former staff.
vk4 <- filter(vk, EmployeeStatus == "Former",
              WorkExperience != "less than 1 year")

v4 <- as.data.frame(udpipe_annotate(ud_model, x = vk4$Text))

keyw_rake3 <- keywords_rake(
  x = v4,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = v4$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake3$key <- factor(keyw_rake3$keyword, levels = rev(keyw_rake3$keyword))
barchart(key ~ rake, data = head(keyw_rake3[keyw_rake3$freq > 1, ], 20),
         col = "red",
         main = "Keywords identified by RAKE for former employees (VK)",
         xlab = "Rake")

Kaspersky

# Keep the Kaspersky reviews only and annotate their text with the udpipe
# model; `k` holds one row per token.
kaspersky <- filter(glass, Company == "Kaspersky")

k <- as.data.frame(udpipe_annotate(ud_model, x = kaspersky$Text))

Brand consistent behavior (BCB)

Brand love & brand hate

# Every Kaspersky lemma found in the emotion lexicon `emo`, one row per
# (lemma, emotion) match. The join key is spelled out instead of being
# guessed from the column names.
k_emo <- k %>%
  select(lemma) %>%
  rename(word = lemma) %>%
  inner_join(emo, by = "word")

# Share of each emotion among all matches, in whole percent.
emo_mean3 <- k_emo %>%
  group_by(sentiment) %>%
  summarize(freq = n()) %>%
  mutate(percent = round(freq / sum(freq) * 100)) %>%
  as.data.frame()

# Fixed display order of the emotions on the x axis. Note that "anger" comes
# before "sadness" here, unlike in the Yandex and VK charts.
k_emo$sentiment <- factor(
  k_emo$sentiment,
  levels = c("trust", "anticipation", "joy", "surprise", "fear",
             "anger", "sadness", "disgust")
)

# Bars show each emotion's share of all matches. Changes: the company name in
# the title was misspelled ("Kasperksy"), `after_stat()` replaces the
# deprecated `..count..` notation, and the breaks now cover the 0-1 range of
# the proportion axis instead of 0-100.
# NOTE(review): grey is hard-coded for the first level ("trust"); the caption
# is only correct while that level is the largest group.
ggplot(k_emo, aes(x = sentiment)) +
  geom_bar(aes(y = after_stat(count / sum(count)), fill = sentiment),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.1)) +
  theme_minimal() +
  ggtitle("Brand love & hate (Kaspersky)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_fill_manual(values = c("grey", "white", "white", "white", "white",
                               "white", "white", "white"))

EVPs in frequent words

Employees

# Lemma frequencies per part of speech in the Kaspersky reviews; each chart
# shows the first 20 rows of the frequency table. The factor levels are
# reversed so that the first row is drawn at the top of the chart.

# Nouns
stats_n <- txt_freq(x = k$lemma[k$upos %in% "NOUN"])
stats_n$key <- factor(stats_n$key, levels = rev(stats_n$key))
barchart(key ~ freq, data = head(stats_n, 20), col = "cadetblue",
         main = "Most occurring nouns (Kaspersky)", xlab = "Frequency")

# Adjectives
stats_a <- txt_freq(k$lemma[k$upos %in% c("ADJ")])
stats_a$key <- factor(stats_a$key, levels = rev(stats_a$key))
barchart(key ~ freq, data = head(stats_a, 20), col = "purple",
         main = "Most occurring adjectives (Kaspersky)", xlab = "Frequency")

# Verbs
stats_v <- txt_freq(k$lemma[k$upos %in% c("VERB")])
stats_v$key <- factor(stats_v$key, levels = rev(stats_v$key))
barchart(key ~ freq, data = head(stats_v, 20), col = "gold",
         main = "Most occurring verbs (Kaspersky)", xlab = "Frequency")

Employer

# Employer-side text for Kaspersky, annotated with the udpipe model.
comp_k <- filter(comp, Company == "Kaspersky")

k2 <- as.data.frame(udpipe_annotate(ud_model, x = comp_k$Text))

# Lemma frequencies per part of speech; each chart shows the first 20 rows of
# the frequency table, first row at the top.

# Nouns
stats_n2 <- txt_freq(x = k2$lemma[k2$upos %in% "NOUN"])
stats_n2$key <- factor(stats_n2$key, levels = rev(stats_n2$key))
barchart(key ~ freq, data = head(stats_n2, 20), col = "cadetblue",
         main = "Most occurring nouns (Kaspersky 2)", xlab = "Frequency")

# Adjectives
stats_a2 <- txt_freq(k2$lemma[k2$upos %in% c("ADJ")])
stats_a2$key <- factor(stats_a2$key, levels = rev(stats_a2$key))
barchart(key ~ freq, data = head(stats_a2, 20), col = "purple",
         main = "Most occurring adjectives (Kaspersky 2)", xlab = "Frequency")

# Verbs
stats_v2 <- txt_freq(k2$lemma[k2$upos %in% c("VERB")])
stats_v2$key <- factor(stats_v2$key, levels = rev(stats_v2$key))
barchart(key ~ freq, data = head(stats_v2, 20), col = "gold",
         main = "Most occurring verbs (Kaspersky 2)", xlab = "Frequency")

Brand endorsement

Ratings

# Number of Kaspersky reviews per rating level (Rating is a factor).
summary(kaspersky$Rating)

##   1   2   3   4   5 
##  34  29  60 156 242

# Share of Kaspersky reviews per rating. Changes:
#  * reviews without a rating are dropped first, as in the Yandex chart, so
#    the five axis labels always line up with the five rating levels;
#  * `after_stat()` replaces the deprecated `..count..` notation;
#  * the breaks cover the 0-1 range of the proportion axis instead of 0-100.
# NOTE(review): grey is hard-coded for the fifth level; the caption is only
# correct while rating 5 is the largest group.
ggplot(drop_na(kaspersky, Rating), aes(x = Rating)) +
  geom_bar(aes(y = after_stat(count / sum(count)), fill = Rating),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.1)) +
  theme_minimal() +
  ggtitle("Ratings (Kaspersky)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_x_discrete(labels = c("Rating 1", "Rating 2", "Rating 3", "Rating 4",
                              "Rating 5")) +
  scale_fill_manual(values = c("white", "white", "white", "white", "grey"))

Sentiments of reviews

# Mean AFINN score per review. Scores are grouped by `doc_id` (one id per
# annotated review) rather than by the sentence text: grouping by text merges
# reviews whose wording is identical and splits a review into several rows
# whenever the annotator finds more than one sentence in it.
# Reviews without any AFINN word drop out in the inner join.
k_sent <- k %>%
  select(doc_id, lemma) %>%
  rename(word = lemma) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(doc_id) %>%
  summarize(overall = mean(value))

summary(k_sent$overall)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.0000  0.8628  1.6000  1.4650  2.1667  4.0000

# Distribution of the mean sentiment scores with a density curve on top.
# `after_stat(density)` replaces the deprecated `..density..`; `bins = 30`
# states ggplot2's default explicitly, which silences the bin-width message.
ggplot(k_sent, aes(x = overall)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30,
                 fill = "white", color = "#868686FF") +
  geom_density(size = 1, color = "black") +
  labs(x = "Sentiment", y = "Density", title = "Sentiments (Kaspersky)") +
  theme_minimal() +
  theme(legend.position = "none")

# Number of rows with a positive mean score
sum(k_sent$overall > 0)

## [1] 459

Brand allegiance

Organizational tenures

# Reviews whose author is marked as a current Kaspersky employee, and the
# distribution of their work experience.
kaspersky2 <- filter(kaspersky, EmployeeStatus == "Current")

summary(kaspersky2$WorkExperience)

##   less than 1 year   more than 1 year more than 10 years  more than 3 years 
##                 44                 66                 11                 58 
##  more than 5 years  more than 8 years               NA's 
##                 42                 17                112

# Current Kaspersky staff with a known work experience.
kaspersky_na <- kaspersky2 %>% drop_na(WorkExperience)

# Tenure categories from shortest to longest; the factor's own levels are
# alphabetical, which would place "more than 10 years" third.
level_order <- c("less than 1 year", "more than 1 year", "more than 3 years",
                 "more than 5 years", "more than 8 years",
                 "more than 10 years")

# Share of current staff per tenure category. Changes: `levels =` is spelled
# out (the original relied on partial matching of `level`), `after_stat()`
# replaces the deprecated `..count..`, and the breaks cover the 0-1 range of
# the proportion axis.
# The fill still follows the alphabetical levels of WorkExperience, so the
# second colour (grey) goes to "more than 1 year".
# NOTE(review): that choice is hard-coded; the caption is only correct while
# this category is the largest group.
ggplot(kaspersky_na, aes(x = WorkExperience)) +
  geom_bar(aes(x = factor(WorkExperience, levels = level_order),
               y = after_stat(count / sum(count)), fill = WorkExperience),
           color = "black", show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.1)) +
  theme_minimal() +
  ggtitle("Organizational tenures of current staff (Kaspersky)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  coord_flip() +
  scale_fill_manual(values = c("white", "grey", "white", "white", "white",
                               "white"))

EVPs in frequent keywords (current staff VS alumni)

# --- Current employees ---
# Current staff whose work experience is stated and is not "less than 1 year"
# (rows with a missing value fail the comparison and are dropped as well).
kaspersky3 <- filter(kaspersky2, WorkExperience != "less than 1 year")

k3 <- as.data.frame(udpipe_annotate(ud_model, x = kaspersky3$Text))

# RAKE keywords of up to four tokens built from nouns and adjectives.
keyw_rake2 <- keywords_rake(
  x = k3,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = k3$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake2$key <- factor(keyw_rake2$keyword, levels = rev(keyw_rake2$keyword))
# Chart the first 20 keywords that occur more than once.
barchart(key ~ rake, data = head(keyw_rake2[keyw_rake2$freq > 1, ], 20),
         col = "red",
         main = "Keywords identified by RAKE for current employees (Kaspersky)",
         xlab = "Rake")

# --- Former employees ---
# Same selection rule applied to reviews by former staff.
kaspersky4 <- filter(kaspersky, EmployeeStatus == "Former",
                     WorkExperience != "less than 1 year")

k4 <- as.data.frame(udpipe_annotate(ud_model, x = kaspersky4$Text))

keyw_rake3 <- keywords_rake(
  x = k4,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = k4$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake3$key <- factor(keyw_rake3$keyword, levels = rev(keyw_rake3$keyword))
barchart(key ~ rake, data = head(keyw_rake3[keyw_rake3$freq > 1, ], 20),
         col = "red",
         main = "Keywords identified by RAKE for former employees (Kaspersky)",
         xlab = "Rake")

Exploring Employer Brand in IT Multinationals through the Employment-related Platform

Maria Buylova, Roman Domarev, Elizaveta Dyachenko

16/05/2023

Intro

Loading libraries

Importing data

Transforming variables

Textual data preprocessing

Constructing udpipe model

Yandex

Brand consistent behavior (BCB)

Brand love & brand hate

EVPs in frequent words

Employees

Employer

Brand endorsement

Ratings

Sentiments of reviews

Brand allegiance

Organizational tenures

EVPs in frequent keywords (current staff VS alumni)

VK

Brand consistent behavior (BCB)

Brand love & brand hate

EVPs in frequent words

Employees

Employer

Brand endorsement

Ratings

Sentiments of reviews

Brand allegiance

Organizational tenures

EVPs in frequent keywords (current staff VS alumni)

Kaspersky

Brand consistent behavior (BCB)

Brand love & brand hate

EVPs in frequent words

Employees

Employer

Brand endorsement

Ratings

Sentiments of reviews

Brand allegiance

Organizational tenures

EVPs in frequent keywords (current staff VS alumni)