Intro

Loading libraries

library(scales)
library(corrplot)
library(plyr)
library(tidyr)
library(dplyr)
library(stringr)
library(tm)
library(tidytext)
library(stopwords)
library(textstem)
library(readxl)
library(textdata)
library(ggpubr)
library(wordcloud)
library(topicmodels)
library(ggplot2)
library(textrank)
library(udpipe)
library(lattice)
library(igraph)
library(ggraph)
library(gofastr)
library(quanteda)
library(readtext)
library(devtools)
library(spacyr)
# devtools::install_github("quanteda/quanteda.corpora")
library(quanteda.corpora)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(seededlda)
library(lubridate)
library(stm)
library(Rtsne)
library(rsvd)
library(geometry)
library(tidyverse)
library(stringi)
library(ldatuning)
library(knitr)

Importing data

# Read the Glassdoor reviews and the job-title lookup (sixth sheet of jobs.xlsx).
glass <- read_excel("/Users/liza/Desktop/ДИПЛОМ/Glassdoor reviews 2.xlsx")
jobs <- read_excel("/Users/liza/Desktop/ДИПЛОМ/jobs.xlsx", sheet = 6)

# Attach the lookup columns to the reviews by job title, then drop rows that
# are exact duplicates of one another.
glass <- full_join(x = glass, y = jobs, by = "JobTitle")
glass <- unique(glass)

# Remove rows without a company (a full join also keeps lookup rows that match
# no review, and those carry NA in the review columns).
# The guard matters: with no NA present, `glass[-integer(0), ]` would select
# zero rows and silently empty the whole data set.
na <- which(is.na(glass$Company))
if (length(na) > 0) {
  glass <- glass[-na, ]
}

Transforming variables

# Recode and type the review metadata in a single pass over the table.
glass <- glass %>%
  dplyr::mutate(
    # Mail.Ru is now called VK, so the current name is used throughout.
    Company = as.factor(ifelse(Company == "Mail.Ru", "VK", Company)),
    # Parse the day-month-year strings into Date values.
    Date = dmy(Date),
    EmployeeStatus = as.factor(EmployeeStatus),
    EmployeeType = as.factor(EmployeeType),
    # Some rows are missing the leading "l" of "less than 1 year"; restore it.
    WorkExperience = as.factor(
      ifelse(WorkExperience == "ess than 1 year", "less than 1 year", WorkExperience)
    ),
    # Reviewers who did not indicate a job title were categorised as
    # "Anonymous"; turn that into NA so they do not form a separate group.
    Category = as.factor(ifelse(Category == "Anonymous", NA, Category)),
    ID = as.factor(ID),
    # Merge every anonymous job-title variant into one "Anonymous" level.
    JobTitle = fct_collapse(
      as.factor(JobTitle),
      "Anonymous" = c(
        "Anonymous Employee", "Anonymous", "Anonymous Contractor",
        "Anonymous Freelancer", "Anonymous Intern"
      )
    ),
    Location = as.factor(Location),
    Rating = as.factor(Rating)
  )

# As with Category, the merged "Anonymous" job title becomes NA: assigning NA
# to the level removes it and sets the matching values to missing.
levels(glass$JobTitle)[levels(glass$JobTitle) == "Anonymous"] <- NA
# Build the 'Text' variable: blank out missing parts first so that unite()
# does not paste the literal string "NA" into the combined review text.
glass <- glass %>%
  dplyr::mutate(
    Pros = ifelse(is.na(Pros), "", Pros),
    Cons = ifelse(is.na(Cons), "", Cons),
    AdviceToManagement = ifelse(is.na(AdviceToManagement), "", AdviceToManagement)
  ) %>%
  unite(Text, c(Cons, Pros, AdviceToManagement), sep = " ")

# Number of space-separated tokens in each review. lengths() always returns an
# integer vector, unlike sapply(), whose return type depends on its input.
# NOTE(review): the split is on single spaces only, so consecutive spaces yield
# empty tokens that are counted as words.
glass$Count <- lengths(strsplit(glass$Text, " "))
summary(glass)
##        ID            Company        Title            Rating     EmployeeStatus
##  K1     :   1   Kaspersky: 521   Length:2022        1   :  58   Current:1328  
##  K10    :   1   VK       : 354   Class :character   2   :  70   Former : 693  
##  K100   :   1   Yandex   :1147   Mode  :character   3   : 193   NA's   :   1  
##  K101   :   1                                       4   : 649                 
##  K102   :   1                                       5   :1051                 
##  K103   :   1                                       NA's:   1                 
##  (Other):2016                                                                 
##              EmployeeType             WorkExperience
##  Contractor        :  27   less than 1 year  :280   
##  Employee          :1879   more than 1 year  :454   
##  Freelancer        :  11   more than 10 years: 23   
##  Intern            :  99   more than 3 years :316   
##  Temporary Employee:   5   more than 5 years :191   
##  NA's              :   1   more than 8 years : 60   
##                            NA's              :698   
##                      JobTitle                Location         Date           
##  Software Engineer       : 176   Moscow          :1147   Min.   :2009-02-23  
##  Software Developer      :  92   Saint Petersburg: 112   1st Qu.:2016-12-09  
##  Project Manager         :  68   Woburn          :  48   Median :2019-12-22  
##  Senior Software Engineer:  63   London          :  30   Mean   :2018-12-29  
##  Frontend Developer      :  48   Yekaterinburg   :  24   3rd Qu.:2021-04-02  
##  (Other)                 :1185   (Other)         : 194   Max.   :2022-02-28  
##  NA's                    : 390   NA's            : 467   NA's   :1           
##      Text             Category       Count       
##  Length:2022        Entry :998   Min.   :  6.00  
##  Class :character   Mid   :401   1st Qu.: 13.00  
##  Mode  :character   Senior:231   Median : 20.00  
##                     NA's  :392   Mean   : 36.11  
##                                  3rd Qu.: 40.75  
##                                  Max.   :435.00  
## 
str(glass)
## tibble [2,022 × 13] (S3: tbl_df/tbl/data.frame)
##  $ ID            : Factor w/ 2022 levels "K1","K10","K100",..: 876 1135 1246 1357 1468 1579 1690 1801 1912 877 ...
##  $ Company       : Factor w/ 3 levels "Kaspersky","VK",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Title         : chr [1:2022] "Great company" "Used to be nice" "good" "Nice for start" ...
##  $ Rating        : Factor w/ 5 levels "1","2","3","4",..: 5 3 5 3 5 5 5 4 5 3 ...
##  $ EmployeeStatus: Factor w/ 2 levels "Current","Former": 2 1 2 2 2 1 2 1 2 2 ...
##  $ EmployeeType  : Factor w/ 5 levels "Contractor","Employee",..: 2 2 3 2 2 4 2 2 1 2 ...
##  $ WorkExperience: Factor w/ 6 levels "less than 1 year",..: 4 2 NA NA 3 NA 2 NA NA 2 ...
##  $ JobTitle      : Factor w/ 549 levels ".NET Developer",..: 90 148 458 14 439 338 465 465 96 503 ...
##  $ Location      : Factor w/ 93 levels "Abingdon","Abuja",..: 52 NA 52 52 52 2 92 52 82 52 ...
##  $ Date          : Date[1:2022], format: "2020-12-08" "2020-12-08" ...
##  $ Text          : chr [1:2022] "Did not notice any Cons interesting challenging tasks \r\nhighly professional specialists \r\nideal working con"| __truncated__ "Outdated technical stack with lots of domestic solutions unknown outside Yandex, and with poor documentation. N"| __truncated__ "It's a big company. Salary is medium-high. It's a big company. Social packet is good. Free lunches, insurance, etc. " "management and working hours. No life good office and nice infrastructure " ...
##  $ Category      : Factor w/ 3 levels "Entry","Mid",..: 1 1 1 1 3 1 1 1 1 2 ...
##  $ Count         : int [1:2022] 16 66 19 11 69 15 18 13 20 131 ...

Textual data preprocessing

# Normalise the review text: lower-case, strip punctuation and digits, trim the
# ends and collapse runs of whitespace.
glass$Text <- glass$Text %>%
  tolower() %>%
  removePunctuation() %>%
  removeNumbers() %>%
  trimws() %>%
  stripWhitespace()

# Unify the spellings of "work life" into the single token "worklife".
# In stri_replace_all() `fixed` is the pattern searched for and the second
# argument is its replacement, hence the explicit argument name.
glass$Text <- stri_replace_all(glass$Text, replacement = "worklife", fixed = "work life")
# NOTE(review): hyphens were already removed by removePunctuation() above, so
# this second replacement is most likely a no-op at this point.
glass$Text <- stri_replace_all(glass$Text, replacement = "worklife", fixed = "work-life")

Constructing udpipe model

# Download the English udpipe model and load it from the downloaded file.
# NOTE(review): presumably this fetches the model again on every run —
# consider caching the file if that is not wanted.
ud_model <- udpipe_load_model(
  udpipe_download_model(language = "english")$file_model
)

# Annotate every review and flatten the annotation into a data frame.
x <- as.data.frame(udpipe_annotate(ud_model, x = glass$Text))
str(x)
## 'data.frame':    72905 obs. of  14 variables:
##  $ doc_id       : chr  "doc1" "doc1" "doc1" "doc1" ...
##  $ paragraph_id : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ sentence_id  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ sentence     : chr  "did not notice any cons interesting challenging tasks highly professional specialists ideal working conditions "| __truncated__ "did not notice any cons interesting challenging tasks highly professional specialists ideal working conditions "| __truncated__ "did not notice any cons interesting challenging tasks highly professional specialists ideal working conditions "| __truncated__ "did not notice any cons interesting challenging tasks highly professional specialists ideal working conditions "| __truncated__ ...
##  $ token_id     : chr  "1" "2" "3" "4" ...
##  $ token        : chr  "did" "not" "notice" "any" ...
##  $ lemma        : chr  "do" "not" "notice" "any" ...
##  $ upos         : chr  "AUX" "PART" "VERB" "DET" ...
##  $ xpos         : chr  "VBD" "RB" "VB" "DT" ...
##  $ feats        : chr  "Mood=Ind|Tense=Past|VerbForm=Fin" NA "VerbForm=Inf" NA ...
##  $ head_token_id: chr  "3" "3" "0" "5" ...
##  $ dep_rel      : chr  "aux" "advmod" "root" "det" ...
##  $ deps         : chr  NA NA NA NA ...
##  $ misc         : chr  NA NA NA NA ...

Yandex

# Reviews of Yandex only, plus their udpipe annotation as a data frame.
yandex <- filter(glass, Company == "Yandex")

y <- as.data.frame(udpipe_annotate(ud_model, x = yandex$Text))

Brand consistent behavior (BCB)

Brand love & brand hate

# NRC lexicon restricted to the eight emotion categories listed below; every
# other NRC label is filtered out.
emo <- get_sentiments("nrc") %>%
  filter(sentiment %in% c(
    "anger", "anticipation", "disgust", "fear",
    "joy", "sadness", "surprise", "trust"
  ))

# Tag each lemma of the Yandex reviews with its NRC emotion(s). The join key is
# stated explicitly instead of relying on implicit column matching.
y_emo <- y %>%
  select(word = lemma) %>%
  inner_join(emo, by = "word")

# Share of each emotion among all matched lemma/emotion pairs, in whole percent.
emo_mean <- y_emo %>%
  group_by(sentiment) %>%
  summarize(freq = n()) %>%
  mutate(percent = round(freq / sum(freq) * 100)) %>%
  as.data.frame()

# Fix the order in which the emotions appear on the x axis.
y_emo$sentiment <- factor(
  y_emo$sentiment,
  levels = c("trust", "anticipation", "joy", "surprise", "fear", "sadness", "anger", "disgust")
)

# Relative-frequency bar chart; only the first level ("trust") is filled grey.
# NOTE(review): the caption calls the grey bar the largest group — the colour
# is positional, so confirm against emo_mean.
# NOTE(review): `..count..` is deprecated in ggplot2 >= 3.4 in favour of
# after_stat(count); left unchanged to stay compatible with older versions.
ggplot(y_emo, aes(x = sentiment)) +
  geom_bar(
    aes(y = (..count..) / sum(..count..), fill = sentiment),
    color = "black",
    show.legend = FALSE
  ) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Brand love & hate (Yandex)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_fill_manual(values = c("grey", rep("white", 7)))

EVPs in frequent words

Employees

# Lemma frequencies per part of speech for the Yandex reviews. In each chart
# the key levels are reversed so that the first rows of the txt_freq() table
# are drawn at the top.

# Nouns
stats_n <- txt_freq(x = y$lemma[y$upos %in% "NOUN"])
stats_n$key <- factor(stats_n$key, levels = rev(stats_n$key))
barchart(
  key ~ freq,
  data = head(stats_n, 20),
  col = "cadetblue",
  main = "Most occurring nouns (Yandex)",
  xlab = "Frequency"
)

# Adjectives
stats_a <- txt_freq(y$lemma[y$upos %in% c("ADJ")])
stats_a$key <- factor(stats_a$key, levels = rev(stats_a$key))
barchart(
  key ~ freq,
  data = head(stats_a, 20),
  col = "purple",
  main = "Most occurring adjectives (Yandex)",
  xlab = "Frequency"
)

# Verbs
stats_v <- txt_freq(y$lemma[y$upos %in% c("VERB")])
stats_v$key <- factor(stats_v$key, levels = rev(stats_v$key))
barchart(
  key ~ freq,
  data = head(stats_v, 20),
  col = "gold",
  main = "Most occurring verbs (Yandex)",
  xlab = "Frequency"
)

Employer

# Employer-side texts: third sheet of the companies workbook, first three rows.
comp <- read_excel("/Users/liza/Desktop/ДИПЛОМ/companies data.xlsx", sheet = 3)
comp <- comp[1:3, ]

# Same normalisation as for the reviews: lower-case, strip punctuation and
# digits, trim the ends and collapse runs of whitespace.
comp$Text <- comp$Text %>%
  tolower() %>%
  removePunctuation() %>%
  removeNumbers() %>%
  trimws() %>%
  stripWhitespace()

# Unify the spellings of "work life" into the single token "worklife"
# (`fixed` is the pattern searched for; `replacement` is what it becomes).
comp$Text <- stri_replace_all(comp$Text, replacement = "worklife", fixed = "work life")
# NOTE(review): hyphens were already removed by removePunctuation() above, so
# this second replacement is most likely a no-op at this point.
comp$Text <- stri_replace_all(comp$Text, replacement = "worklife", fixed = "work-life")

# Number of space-separated tokens in each text. lengths() always returns an
# integer vector, unlike sapply(), whose return type depends on its input.
comp$Count <- lengths(strsplit(comp$Text, " "))

summary(comp)
##    Company              Text             ...3             Count      
##  Length:3           Length:3           Mode:logical   Min.   :455.0  
##  Class :character   Class :character   NA's:3         1st Qu.:484.5  
##  Mode  :character   Mode  :character                  Median :514.0  
##                                                       Mean   :595.7  
##                                                       3rd Qu.:666.0  
##                                                       Max.   :818.0
# Employer-side text of Yandex, annotated with udpipe.
comp_y <- filter(comp, Company == "Yandex")

y2 <- as.data.frame(udpipe_annotate(ud_model, x = comp_y$Text))

# Lemma frequencies per part of speech; key levels are reversed so that the
# first rows of the txt_freq() table are drawn at the top.

# Nouns
stats_n2 <- txt_freq(x = y2$lemma[y2$upos %in% "NOUN"])
stats_n2$key <- factor(stats_n2$key, levels = rev(stats_n2$key))
barchart(
  key ~ freq,
  data = head(stats_n2, 20),
  col = "cadetblue",
  main = "Most occurring nouns (Yandex 2)",
  xlab = "Frequency"
)

# Adjectives
stats_a2 <- txt_freq(y2$lemma[y2$upos %in% c("ADJ")])
stats_a2$key <- factor(stats_a2$key, levels = rev(stats_a2$key))
barchart(
  key ~ freq,
  data = head(stats_a2, 20),
  col = "purple",
  main = "Most occurring adjectives (Yandex 2)",
  xlab = "Frequency"
)

# Verbs
stats_v2 <- txt_freq(y2$lemma[y2$upos %in% c("VERB")])
stats_v2$key <- factor(stats_v2$key, levels = rev(stats_v2$key))
barchart(
  key ~ freq,
  data = head(stats_v2, 20),
  col = "gold",
  main = "Most occurring verbs (Yandex 2)",
  xlab = "Frequency"
)

Brand endorsement

Ratings

summary(yandex$Rating)
##    1    2    3    4    5 NA's 
##   15   27   95  353  656    1
# Drop the review without a rating so that no NA bar is drawn.
yandex_na <- drop_na(yandex, Rating)

# Share of each rating; the fifth level (rating 5, the largest group in the
# summary above) is the one filled grey.
ggplot(yandex_na, aes(x = Rating)) +
  geom_bar(
    aes(y = (..count..) / sum(..count..), fill = Rating),
    color = "black",
    show.legend = FALSE
  ) +
  scale_x_discrete(labels = paste("Rating", 1:5)) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  scale_fill_manual(values = c(rep("white", 4), "grey")) +
  theme_minimal() +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  ggtitle("Ratings (Yandex)")

Sentiments of reviews

# Score every lemma with the AFINN lexicon and average the scores per value of
# `sentence`. The join key is stated explicitly instead of being inferred.
# NOTE(review): grouping is by the sentence text, so identical texts coming
# from different reviews fall into one group — confirm this is intended
# (grouping by doc_id would keep reviews apart).
y_sent <- y %>%
  select(word = lemma, sentence) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(sentence) %>%
  summarize(overall = mean(value))
summary(y_sent$overall)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.0000  0.8452  1.5000  1.4309  2.0000  4.0000
# Distribution of the mean scores: density-scaled histogram plus density curve.
ggplot(y_sent, aes(x = overall)) +
  geom_histogram(aes(y = ..density..), fill = "white", color = "#868686FF") +
  geom_density(size = 0.7, color = "black") +
  labs(x = "Sentiment", y = "Density", title = "Sentiments (Yandex)") +
  theme_minimal() +
  theme(legend.position = "none")

# Number of groups whose mean score is positive.
nrow(y_sent[y_sent$overall > 0, ])
## [1] 993

Brand allegiance

Organizational tenures

# Current Yandex employees only.
yandex2 <- yandex %>%
  filter(EmployeeStatus == "Current")
summary(yandex2$WorkExperience)
##   less than 1 year   more than 1 year more than 10 years  more than 3 years 
##                 93                166                  8                128 
##  more than 5 years  more than 8 years               NA's 
##                 68                 22                266
# Leave out reviews that do not state a tenure.
yandex_na <- yandex2 %>% drop_na(WorkExperience)

# Tenure categories in chronological order (the factor itself is alphabetical).
level_order <- c(
  "less than 1 year", "more than 1 year", "more than 3 years",
  "more than 5 years", "more than 8 years", "more than 10 years"
)

# Share of each tenure category. Fill colours are named by category, so the
# grey bar stays on "more than 1 year" (the largest group in the summary above)
# even if some category were missing from the data.
ggplot(yandex_na, aes(x = WorkExperience)) +
  geom_bar(
    aes(
      x = factor(WorkExperience, levels = level_order),
      y = (..count..) / sum(..count..),
      fill = WorkExperience
    ),
    color = "black",
    show.legend = FALSE
  ) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Organizational tenures of current staff (Yandex)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  coord_flip() +
  scale_fill_manual(
    values = setNames(
      ifelse(level_order == "more than 1 year", "grey", "white"),
      level_order
    )
  )

EVPs in frequent keywords (current staff VS alumni)

# Current employees with a stated tenure other than "less than 1 year"
# (filter() also drops rows where WorkExperience is NA).
yandex3 <- filter(yandex2, WorkExperience != "less than 1 year")

y3 <- as.data.frame(udpipe_annotate(ud_model, x = yandex3$Text))

# RAKE keywords built from noun/adjective tokens, up to four words long,
# computed within each sentence of each document.
keyw_rake2 <- keywords_rake(
  x = y3,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = y3$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake2$key <- factor(keyw_rake2$keyword, levels = rev(keyw_rake2$keyword))
# Plot the first 20 keywords that occur more than once.
barchart(
  key ~ rake,
  data = head(keyw_rake2[keyw_rake2$freq > 1, ], 20),
  col = "red",
  main = "Keywords identified by RAKE for current employees (Yandex)",
  xlab = "Rake"
)

# Same analysis for former employees.
yandex4 <- filter(
  yandex,
  EmployeeStatus == "Former",
  WorkExperience != "less than 1 year"
)

y4 <- as.data.frame(udpipe_annotate(ud_model, x = yandex4$Text))

keyw_rake3 <- keywords_rake(
  x = y4,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = y4$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake3$key <- factor(keyw_rake3$keyword, levels = rev(keyw_rake3$keyword))
barchart(
  key ~ rake,
  data = head(keyw_rake3[keyw_rake3$freq > 1, ], 20),
  col = "red",
  main = "Keywords identified by RAKE for former employees (Yandex)",
  xlab = "Rake"
)

VK

# Reviews of VK only, plus their udpipe annotation as a data frame.
vk <- filter(glass, Company == "VK")

v <- as.data.frame(udpipe_annotate(ud_model, x = vk$Text))

Brand consistent behavior (BCB)

Brand love & brand hate

# Tag each lemma of the VK reviews with its emotion(s) from `emo`. The join key
# is stated explicitly instead of relying on implicit column matching.
v_emo <- v %>%
  select(word = lemma) %>%
  inner_join(emo, by = "word")

# Share of each emotion among all matched lemma/emotion pairs, in whole percent.
emo_mean2 <- v_emo %>%
  group_by(sentiment) %>%
  summarize(freq = n()) %>%
  mutate(percent = round(freq / sum(freq) * 100)) %>%
  as.data.frame()

# Fix the order in which the emotions appear on the x axis.
v_emo$sentiment <- factor(
  v_emo$sentiment,
  levels = c("trust", "anticipation", "joy", "surprise", "fear", "sadness", "anger", "disgust")
)

# Relative-frequency bar chart; only the first level ("trust") is filled grey.
# NOTE(review): the caption calls the grey bar the largest group — the colour
# is positional, so confirm against emo_mean2.
ggplot(v_emo, aes(x = sentiment)) +
  geom_bar(
    aes(y = (..count..) / sum(..count..), fill = sentiment),
    color = "black",
    show.legend = FALSE
  ) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Brand love & hate (VK)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_fill_manual(values = c("grey", rep("white", 7)))

EVPs in frequent words

Employees

# Lemma frequencies per part of speech for the VK reviews. In each chart the
# key levels are reversed so that the first rows of the txt_freq() table are
# drawn at the top.

# Nouns
stats_n <- txt_freq(x = v$lemma[v$upos %in% "NOUN"])
stats_n$key <- factor(stats_n$key, levels = rev(stats_n$key))
barchart(
  key ~ freq,
  data = head(stats_n, 20),
  col = "cadetblue",
  main = "Most occurring nouns (VK)",
  xlab = "Frequency"
)

# Adjectives
stats_a <- txt_freq(v$lemma[v$upos %in% c("ADJ")])
stats_a$key <- factor(stats_a$key, levels = rev(stats_a$key))
barchart(
  key ~ freq,
  data = head(stats_a, 20),
  col = "purple",
  main = "Most occurring adjectives (VK)",
  xlab = "Frequency"
)

# Verbs
stats_v <- txt_freq(v$lemma[v$upos %in% c("VERB")])
stats_v$key <- factor(stats_v$key, levels = rev(stats_v$key))
barchart(
  key ~ freq,
  data = head(stats_v, 20),
  col = "gold",
  main = "Most occurring verbs (VK)",
  xlab = "Frequency"
)

Employer

# Employer-side text of VK, annotated with udpipe.
comp_v <- filter(comp, Company == "VK")

v2 <- as.data.frame(udpipe_annotate(ud_model, x = comp_v$Text))

# Lemma frequencies per part of speech; key levels are reversed so that the
# first rows of the txt_freq() table are drawn at the top.

# Nouns
stats_n2 <- txt_freq(x = v2$lemma[v2$upos %in% "NOUN"])
stats_n2$key <- factor(stats_n2$key, levels = rev(stats_n2$key))
barchart(
  key ~ freq,
  data = head(stats_n2, 20),
  col = "cadetblue",
  main = "Most occurring nouns (VK 2)",
  xlab = "Frequency"
)

# Adjectives
stats_a2 <- txt_freq(v2$lemma[v2$upos %in% c("ADJ")])
stats_a2$key <- factor(stats_a2$key, levels = rev(stats_a2$key))
barchart(
  key ~ freq,
  data = head(stats_a2, 20),
  col = "purple",
  main = "Most occurring adjectives (VK 2)",
  xlab = "Frequency"
)

# Verbs
stats_v2 <- txt_freq(v2$lemma[v2$upos %in% c("VERB")])
stats_v2$key <- factor(stats_v2$key, levels = rev(stats_v2$key))
barchart(
  key ~ freq,
  data = head(stats_v2, 20),
  col = "gold",
  main = "Most occurring verbs (VK 2)",
  xlab = "Frequency"
)

Brand endorsement

Ratings

summary(vk$Rating)
##   1   2   3   4   5 
##   9  14  38 140 153
# Share of each rating; the fifth level (rating 5, the largest group in the
# summary above) is filled grey. Rating has five levels, so exactly five fill
# colours are supplied, matching the rating plots of the other companies.
ggplot(vk, aes(x = Rating)) +
  geom_bar(
    aes(y = (..count..) / sum(..count..), fill = Rating),
    color = "black",
    show.legend = FALSE
  ) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Ratings (VK)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_x_discrete(labels = c("Rating 1", "Rating 2", "Rating 3", "Rating 4", "Rating 5")) +
  scale_fill_manual(values = c("white", "white", "white", "white", "grey"))

Sentiments of reviews

# Score every lemma with the AFINN lexicon and average the scores per value of
# `sentence`. The join key is stated explicitly instead of being inferred.
# NOTE(review): grouping is by the sentence text, so identical texts coming
# from different reviews fall into one group — confirm this is intended
# (grouping by doc_id would keep reviews apart).
v_sent <- v %>%
  select(word = lemma, sentence) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(sentence) %>%
  summarize(overall = mean(value))
summary(v_sent$overall)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.0000  0.6667  1.5000  1.3869  2.2000  3.5000
# Distribution of the mean scores: density-scaled histogram plus density curve.
ggplot(v_sent, aes(x = overall)) +
  geom_histogram(aes(y = ..density..), fill = "white", color = "#868686FF") +
  geom_density(size = 1, color = "black") +
  labs(x = "Sentiment", y = "Density", title = "Sentiments (VK)") +
  theme_minimal() +
  theme(legend.position = "none")

# Number of groups whose mean score is positive.
nrow(v_sent[v_sent$overall > 0, ])
## [1] 289

Brand allegiance

Organizational tenures

# Current VK employees only.
vk2 <- vk %>%
  filter(EmployeeStatus == "Current")
summary(vk2$WorkExperience)
##   less than 1 year   more than 1 year more than 10 years  more than 3 years 
##                 29                 46                  1                 37 
##  more than 5 years  more than 8 years               NA's 
##                 12                  7                 95
# Leave out reviews that do not state a tenure.
vk_na <- vk2 %>% drop_na(WorkExperience)

# Tenure categories in chronological order (the factor itself is alphabetical).
level_order <- c(
  "less than 1 year", "more than 1 year", "more than 3 years",
  "more than 5 years", "more than 8 years", "more than 10 years"
)

# Share of each tenure category. Fill colours are named by category, so the
# grey bar stays on "more than 1 year" (the largest group in the summary above)
# even if some category were missing from the data.
ggplot(vk_na, aes(x = WorkExperience)) +
  geom_bar(
    aes(
      x = factor(WorkExperience, levels = level_order),
      y = (..count..) / sum(..count..),
      fill = WorkExperience
    ),
    color = "black",
    show.legend = FALSE
  ) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Organizational tenures of current staff (VK)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  coord_flip() +
  scale_fill_manual(
    values = setNames(
      ifelse(level_order == "more than 1 year", "grey", "white"),
      level_order
    )
  )

EVPs in frequent keywords (current staff VS alumni)

# Current employees with a stated tenure other than "less than 1 year"
# (filter() also drops rows where WorkExperience is NA).
vk3 <- filter(vk2, WorkExperience != "less than 1 year")

v3 <- as.data.frame(udpipe_annotate(ud_model, x = vk3$Text))

# RAKE keywords built from noun/adjective tokens, up to four words long,
# computed within each sentence of each document.
keyw_rake2 <- keywords_rake(
  x = v3,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = v3$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake2$key <- factor(keyw_rake2$keyword, levels = rev(keyw_rake2$keyword))
# Plot the first 20 keywords that occur more than once.
barchart(
  key ~ rake,
  data = head(keyw_rake2[keyw_rake2$freq > 1, ], 20),
  col = "red",
  main = "Keywords identified by RAKE for current employees (VK)",
  xlab = "Rake"
)

# Same analysis for former employees.
vk4 <- filter(
  vk,
  EmployeeStatus == "Former",
  WorkExperience != "less than 1 year"
)

v4 <- as.data.frame(udpipe_annotate(ud_model, x = vk4$Text))

keyw_rake3 <- keywords_rake(
  x = v4,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = v4$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake3$key <- factor(keyw_rake3$keyword, levels = rev(keyw_rake3$keyword))
barchart(
  key ~ rake,
  data = head(keyw_rake3[keyw_rake3$freq > 1, ], 20),
  col = "red",
  main = "Keywords identified by RAKE for former employees (VK)",
  xlab = "Rake"
)

Kaspersky

# Reviews of Kaspersky only, plus their udpipe annotation as a data frame.
kaspersky <- filter(glass, Company == "Kaspersky")

k <- as.data.frame(udpipe_annotate(ud_model, x = kaspersky$Text))

Brand consistent behavior (BCB)

Brand love & brand hate

# Tag each lemma of the Kaspersky reviews with its emotion(s) from `emo`. The
# join key is stated explicitly instead of relying on implicit column matching.
k_emo <- k %>%
  select(word = lemma) %>%
  inner_join(emo, by = "word")

# Share of each emotion among all matched lemma/emotion pairs, in whole percent.
emo_mean3 <- k_emo %>%
  group_by(sentiment) %>%
  summarize(freq = n()) %>%
  mutate(percent = round(freq / sum(freq) * 100)) %>%
  as.data.frame()

# Fix the order in which the emotions appear on the x axis (here "anger" is
# placed before "sadness").
k_emo$sentiment <- factor(
  k_emo$sentiment,
  levels = c("trust", "anticipation", "joy", "surprise", "fear", "anger", "sadness", "disgust")
)

# Relative-frequency bar chart; only the first level ("trust") is filled grey.
# NOTE(review): the caption calls the grey bar the largest group — the colour
# is positional, so confirm against emo_mean3.
# The company name in the title was misspelled ("Kasperksy") and is corrected.
ggplot(k_emo, aes(x = sentiment)) +
  geom_bar(
    aes(y = (..count..) / sum(..count..), fill = sentiment),
    color = "black",
    show.legend = FALSE
  ) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Brand love & hate (Kaspersky)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  scale_fill_manual(values = c("grey", rep("white", 7)))

EVPs in frequent words

Employees

# Lemma frequencies per part of speech for the Kaspersky reviews. In each chart
# the key levels are reversed so that the first rows of the txt_freq() table
# are drawn at the top.

# Nouns
stats_n <- txt_freq(x = k$lemma[k$upos %in% "NOUN"])
stats_n$key <- factor(stats_n$key, levels = rev(stats_n$key))
barchart(
  key ~ freq,
  data = head(stats_n, 20),
  col = "cadetblue",
  main = "Most occurring nouns (Kaspersky)",
  xlab = "Frequency"
)

# Adjectives
stats_a <- txt_freq(k$lemma[k$upos %in% c("ADJ")])
stats_a$key <- factor(stats_a$key, levels = rev(stats_a$key))
barchart(
  key ~ freq,
  data = head(stats_a, 20),
  col = "purple",
  main = "Most occurring adjectives (Kaspersky)",
  xlab = "Frequency"
)

# Verbs
stats_v <- txt_freq(k$lemma[k$upos %in% c("VERB")])
stats_v$key <- factor(stats_v$key, levels = rev(stats_v$key))
barchart(
  key ~ freq,
  data = head(stats_v, 20),
  col = "gold",
  main = "Most occurring verbs (Kaspersky)",
  xlab = "Frequency"
)

Employer

# Employer-side text of Kaspersky, annotated with udpipe.
comp_k <- filter(comp, Company == "Kaspersky")

k2 <- as.data.frame(udpipe_annotate(ud_model, x = comp_k$Text))

# Lemma frequencies per part of speech; key levels are reversed so that the
# first rows of the txt_freq() table are drawn at the top.

# Nouns
stats_n2 <- txt_freq(x = k2$lemma[k2$upos %in% "NOUN"])
stats_n2$key <- factor(stats_n2$key, levels = rev(stats_n2$key))
barchart(
  key ~ freq,
  data = head(stats_n2, 20),
  col = "cadetblue",
  main = "Most occurring nouns (Kaspersky 2)",
  xlab = "Frequency"
)

# Adjectives
stats_a2 <- txt_freq(k2$lemma[k2$upos %in% c("ADJ")])
stats_a2$key <- factor(stats_a2$key, levels = rev(stats_a2$key))
barchart(
  key ~ freq,
  data = head(stats_a2, 20),
  col = "purple",
  main = "Most occurring adjectives (Kaspersky 2)",
  xlab = "Frequency"
)

# Verbs
stats_v2 <- txt_freq(k2$lemma[k2$upos %in% c("VERB")])
stats_v2$key <- factor(stats_v2$key, levels = rev(stats_v2$key))
barchart(
  key ~ freq,
  data = head(stats_v2, 20),
  col = "gold",
  main = "Most occurring verbs (Kaspersky 2)",
  xlab = "Frequency"
)

Brand endorsement

Ratings

summary(kaspersky$Rating)
##   1   2   3   4   5 
##  34  29  60 156 242
# Share of each rating; the fifth level (rating 5, the largest group in the
# summary above) is the one filled grey.
ggplot(kaspersky, aes(x = Rating)) +
  geom_bar(
    aes(y = (..count..) / sum(..count..), fill = Rating),
    color = "black",
    show.legend = FALSE
  ) +
  scale_x_discrete(labels = paste("Rating", 1:5)) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  scale_fill_manual(values = c(rep("white", 4), "grey")) +
  theme_minimal() +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  ggtitle("Ratings (Kaspersky)")

Sentiments of reviews

# Score every lemma with the AFINN lexicon and average the scores per value of
# `sentence`. The join key is stated explicitly instead of being inferred.
# NOTE(review): grouping is by the sentence text, so identical texts coming
# from different reviews fall into one group — confirm this is intended
# (grouping by doc_id would keep reviews apart).
k_sent <- k %>%
  select(word = lemma, sentence) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(sentence) %>%
  summarize(overall = mean(value))
summary(k_sent$overall)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.0000  0.8628  1.6000  1.4650  2.1667  4.0000
# Distribution of the mean scores: density-scaled histogram plus density curve.
ggplot(k_sent, aes(x = overall)) +
  geom_histogram(aes(y = ..density..), fill = "white", color = "#868686FF") +
  geom_density(size = 1, color = "black") +
  labs(x = "Sentiment", y = "Density", title = "Sentiments (Kaspersky)") +
  theme_minimal() +
  theme(legend.position = "none")

# Number of groups whose mean score is positive.
nrow(k_sent[k_sent$overall > 0, ])
## [1] 459

Brand allegiance

Organizational tenures

# Current Kaspersky employees only.
kaspersky2 <- kaspersky %>%
  filter(EmployeeStatus == "Current")
summary(kaspersky2$WorkExperience)
##   less than 1 year   more than 1 year more than 10 years  more than 3 years 
##                 44                 66                 11                 58 
##  more than 5 years  more than 8 years               NA's 
##                 42                 17                112
# Leave out reviews that do not state a tenure.
kaspersky_na <- kaspersky2 %>% drop_na(WorkExperience)

# Tenure categories in chronological order (the factor itself is alphabetical).
level_order <- c(
  "less than 1 year", "more than 1 year", "more than 3 years",
  "more than 5 years", "more than 8 years", "more than 10 years"
)

# Share of each tenure category. Fill colours are named by category, so the
# grey bar stays on "more than 1 year" (the largest group in the summary above)
# even if some category were missing from the data.
ggplot(kaspersky_na, aes(x = WorkExperience)) +
  geom_bar(
    aes(
      x = factor(WorkExperience, levels = level_order),
      y = (..count..) / sum(..count..),
      fill = WorkExperience
    ),
    color = "black",
    show.legend = FALSE
  ) +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 100, by = 0.1)) +
  theme_minimal() +
  ggtitle("Organizational tenures of current staff (Kaspersky)") +
  labs(x = "", y = "", caption = "Grey highlights the largest group") +
  coord_flip() +
  scale_fill_manual(
    values = setNames(
      ifelse(level_order == "more than 1 year", "grey", "white"),
      level_order
    )
  )

EVPs in frequent keywords (current staff VS alumni)

# Current employees with a stated tenure other than "less than 1 year"
# (filter() also drops rows where WorkExperience is NA).
kaspersky3 <- filter(kaspersky2, WorkExperience != "less than 1 year")

k3 <- as.data.frame(udpipe_annotate(ud_model, x = kaspersky3$Text))

# RAKE keywords built from noun/adjective tokens, up to four words long,
# computed within each sentence of each document.
keyw_rake2 <- keywords_rake(
  x = k3,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = k3$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake2$key <- factor(keyw_rake2$keyword, levels = rev(keyw_rake2$keyword))
# Plot the first 20 keywords that occur more than once.
barchart(
  key ~ rake,
  data = head(keyw_rake2[keyw_rake2$freq > 1, ], 20),
  col = "red",
  main = "Keywords identified by RAKE for current employees (Kaspersky)",
  xlab = "Rake"
)

# Same analysis for former employees.
kaspersky4 <- filter(
  kaspersky,
  EmployeeStatus == "Former",
  WorkExperience != "less than 1 year"
)

k4 <- as.data.frame(udpipe_annotate(ud_model, x = kaspersky4$Text))

keyw_rake3 <- keywords_rake(
  x = k4,
  term = "token",
  group = c("doc_id", "paragraph_id", "sentence_id"),
  relevant = k4$upos %in% c("NOUN", "ADJ"),
  ngram_max = 4
)
keyw_rake3$key <- factor(keyw_rake3$keyword, levels = rev(keyw_rake3$keyword))
barchart(
  key ~ rake,
  data = head(keyw_rake3[keyw_rake3$freq > 1, ], 20),
  col = "red",
  main = "Keywords identified by RAKE for former employees (Kaspersky)",
  xlab = "Rake"
)