if(!require(dplyr)) {
install.packages("dplyr")
library(dplyr)
}
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if(!require(ggplot2)) {
install.packages("ggplot2")
library(ggplot2)
}
## Loading required package: ggplot2
if(!require(scales)) {
install.packages("scales")
library(scales)
}
## Loading required package: scales
if(!require(lubridate)) {
install.packages("lubridate")
library(lubridate)
}
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 4.4.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
This exercise uses the HF 2021 Employee Engagement Dataset from Kaggle. The data comes from a platform called Happyforce that helps companies measure “Employee Engagement” and collect feedback throughout the year. You will need the following files:
companyMetadata.csv
employees.csv
hiVotes.csv
scoreMetadata.csv
scoreVotes.csv
For this exercise, you will be computing a few quantitative measures of this dataset using R and then reflecting on the data collection and analysis process.
hiVotesDF <- read.csv("C:/Users/diana/Downloads/archive/hiVotes.csv")
scoreMetadataDF <- read.csv("C:/Users/diana/Downloads/archive/scoreMetadata.csv")
scoreVotesDF <- read.csv("C:/Users/diana/Downloads/archive/scoreVotes.csv")
companyMetadataDF <- read.csv("C:/Users/diana/Downloads/archive/companyMetadata.csv")
employeesDF <- read.csv("C:/Users/diana/Downloads/archive/employees.csv")
The dataset is composed of 5 files:
companyMetadata.csv: This file contains the list of
company hashed ids, it’s industry and timezones.
employees.csv: This file contains the list of all
employees that have, at least, participated once on any question on the
dataset.. Also some metadata like hiringdate, birthdate and gender is
provided when available. Also the emlployee status (if it stills
available on the system or not).
scoreMetadata.csv: Contains the list of Scores,
Factors and Questions with their ids, for reference on other
files.
hiVotes.csv: Contains all the votes of the employees
for all companies on the HI Question.
scoreVotes.csv: Contains all the votes of the
employees for all companies on the Score Questions.
head(hiVotesDF, 10)
## companyId employeeId departmentId
## 1 57908a2622881200033b34d7 461 57df9888838c0d0003e2abae
## 2 57908a2622881200033b34d7 354 57df9888838c0d0003e2abae
## 3 57908a2622881200033b34d7 461 57df9888838c0d0003e2abae
## 4 57908a2622881200033b34d7 461 57df9888838c0d0003e2abae
## 5 57908a2622881200033b34d7 461 57df9888838c0d0003e2abae
## 6 57908a2622881200033b34d7 354 57df9888838c0d0003e2abae
## 7 57908a2622881200033b34d7 461 57df9888838c0d0003e2abae
## 8 57908a2622881200033b34d7 8 58f616f0ca49f6000474c546
## 9 57908a2622881200033b34d7 8 58f616f0ca49f6000474c546
## 10 57908a2622881200033b34d7 77 58f616f0ca49f6000474c546
## date hiVote
## 1 2021-01-27 00:00:00 4
## 2 2021-01-28 00:00:00 4
## 3 2021-01-28 00:00:00 4
## 4 2021-01-29 00:00:00 4
## 5 2021-02-01 00:00:00 4
## 6 2021-02-02 00:00:00 4
## 7 2021-02-02 00:00:00 4
## 8 2017-04-18 00:00:00 2
## 9 2017-04-19 00:00:00 4
## 10 2017-04-20 00:00:00 4
head(scoreMetadataDF, 10)
## scoreId name factorId
## 1 5df0cbe8d8e67200048f1b09 Relationships 5df0cbe8d8e67200048f1b05
## 2 5df0cbe8d8e67200048f1b09 Relationships 5df0cbe8d8e67200048f1b05
## 3 5df0cbe8d8e67200048f1b09 Relationships 5df0cbe8d8e67200048f1b05
## 4 5df0cbe8d8e67200048f1b09 Relationships 5df0e70ad8e67200048f1c0c
## 5 5df0cbe8d8e67200048f1b09 Relationships 5df0e70ad8e67200048f1c0c
## 6 5df0cbe8d8e67200048f1b09 Relationships 5df0e70ad8e67200048f1c0c
## 7 5df0cbe8d8e67200048f1b09 Relationships 5df0e70ad8e67200048f1c10
## 8 5df0cbe8d8e67200048f1b09 Relationships 5df0e70ad8e67200048f1c10
## 9 5df0cbe8d8e67200048f1b09 Relationships 5df0e70ad8e67200048f1c10
## 10 5dd6d642103b810004915e04 Intrinsic Motivation 5dd6d642103b810004915dfb
## factor questionId
## 1 Managers 5df0cbe8d8e67200048f1b06
## 2 Managers 5df0cbe8d8e67200048f1b07
## 3 Managers 5df0cbe8d8e67200048f1b08
## 4 Peers 5df0e70ad8e67200048f1c0d
## 5 Peers 5df0e70ad8e67200048f1c0e
## 6 Peers 5df0e70ad8e67200048f1c0f
## 7 Confidence 5df0e70ad8e67200048f1c11
## 8 Confidence 5df0e70ad8e67200048f1c12
## 9 Confidence 5df0e70ad8e67200048f1c13
## 10 Autonomy 5dd6d642103b810004915dfc
## question
## 1 What is it like working with your direct manager?
## 2 My manager helps bring the best out of people
## 3 When I ask my manager for help, he/she devotes time to help me
## 4 I feel like I am part of a team
## 5 I can count on my co-workers to help out when needed
## 6 *|COMPANY_NAME|* is the kind of place where I could make friends.
## 7 I trust that the people I work with are committed to do a good job
## 8 I would rate my direct manager's management skills as:
## 9 My confidence in the senior leadership team of *|COMPANY_NAME|* is:
## 10 I rate the freedom that I have to decide how to do my work as:
head(scoreVotesDF, 10)
## companyId employeeId departmentId
## 1 57908a2622881200033b34d7 354 57df9888838c0d0003e2abae
## 2 57908a2622881200033b34d7 471 5b505f30d1c5730004f2d986
## 3 57908a2622881200033b34d7 505 5b505f30d1c5730004f2d986
## 4 57908a2622881200033b34d7 503 5b505f30d1c5730004f2d986
## 5 57908a2622881200033b34d7 521 5b505f30d1c5730004f2d986
## 6 57908a2622881200033b34d7 572 5b505f30d1c5730004f2d986
## 7 57908a2622881200033b34d7 571 5b505f30d1c5730004f2d986
## 8 57908a2622881200033b34d7 455 5ba1563430cf530004025c7e
## 9 57908a2622881200033b34d7 465 5ba1563430cf530004025c7e
## 10 57908a2622881200033b34d7 455 5ba1563430cf530004025c7e
## scoreId factorId questionId
## 1 5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 2 5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 3 5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 4 5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 5 5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 6 5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 7 5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 8 5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 9 5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 10 5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## date scoreVote
## 1 2021-02-02 00:00:00 5
## 2 2020-01-22 00:00:00 4
## 3 2020-01-24 00:00:00 5
## 4 2020-02-21 00:00:00 3
## 5 2020-02-28 00:00:00 5
## 6 2020-07-23 00:00:00 4
## 7 2020-09-23 00:00:00 4
## 8 2019-12-28 00:00:00 4
## 9 2020-01-03 00:00:00 5
## 10 2020-01-20 00:00:00 5
head(companyMetadataDF, 10)
## companyId industry timezone
## 1 57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 2 57c4aa7dbb8b5c000396fd3b HEALTH_CARE_SOCIAL_ASSISTANCE America/Guayaquil
## 3 56fd2b64f41c670003f643c8 MANUFACTURING Europe/Madrid
## 4 57ac8b23be7fe30003e656d0 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 5 57c01f7e8e0e090003ab5955 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 6 590047cd5a640900042eb54e MANUFACTURING Europe/Madrid
## 7 59439923a74bf90004bfc467 NON_PROFIT_ORGANIZATION Europe/Madrid
## 8 56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 9 574c423856b6300003009953 UTILITIES_AND_SERVICES Europe/Madrid
## 10 595e5c6ed2b2cb00042fd136 MANUFACTURING Europe/Warsaw
head(employeesDF, 10)
## companyId employeeId gender hiringDate birthDate
## 1 57908a2622881200033b34d7 354 1974-07-23 00:00:00
## 2 57908a2622881200033b34d7 471
## 3 57908a2622881200033b34d7 505
## 4 57908a2622881200033b34d7 503
## 5 57908a2622881200033b34d7 521
## 6 57908a2622881200033b34d7 572
## 7 57908a2622881200033b34d7 571
## 8 57908a2622881200033b34d7 455
## 9 57908a2622881200033b34d7 465 1982-04-19 00:00:00
## 10 57908a2622881200033b34d7 464
## deleted deletionDate
## 1 false
## 2 true 2020-11-12 11:31:41
## 3 true 2020-07-14 19:20:39
## 4
## 5 false
## 6 false
## 7 false
## 8 false
## 9 false
## 10 false
For this exercise, you will be computing a few quantitative measures of the HF dataset. Use R to compute these quantitative measures.
Complete the following table of mean and standard deviation for different features in the dataset (relevant files shown in parentheses). Round your answers to two digits after the decimal (nearest hundredth).
Mean and StDev for HiVote
hiVote_mean <- mean(hiVotesDF$hiVote, na.rm = TRUE)
hiVote_stdev <- sd(hiVotesDF$hiVote, na.rm = TRUE)
cat("hiVote mean:", hiVote_mean, "\n")
## hiVote mean: 2.915459
cat("hiVote stdev:", hiVote_stdev, "\n")
## hiVote stdev: 0.9799774
Mean and StDev for ScoreVote
scoreVote_mean <- mean(scoreVotesDF$scoreVote, na.rm = TRUE)
scoreVote_stdev <- sd(scoreVotesDF$scoreVote, na.rm = TRUE)
cat("scoreVote mean:", scoreVote_mean, "\n")
## scoreVote mean: 6.229596
cat("scoreVote stdev:", scoreVote_stdev, "\n")
## scoreVote stdev: 2.747134
Scorevote for Wellbeing
wellbeing_questions <- scoreMetadataDF %>% filter(grepl("wellbeing", question, ignore.case = TRUE))
wellbeing_votes <- scoreVotesDF %>% filter(questionId %in% wellbeing_questions$questionId)
wellbeing_question_summary <- wellbeing_votes %>%
left_join(scoreMetadataDF, by = "questionId") %>%
group_by(questionId, question) %>%
summarise(
mean_scoreVote = mean(scoreVote, na.rm = TRUE),
stdev_scoreVote = sd(scoreVote, na.rm = TRUE),
.groups = 'drop'
)
wellbeing_question_summary %>%
mutate(
result = paste("Question type:", question,
"\nMean scoreVote:", mean_scoreVote,
"\nStdev scoreVote:", stdev_scoreVote, "\n")
) %>%
pull(result) %>%
cat(sep = "\n")
## Question type: *|COMPANY_NAME|* shows a commitment to the wellbeing of employees
## Mean scoreVote: 3.75682637815559
## Stdev scoreVote: 1.02379824656329
##
## Question type: My manager genuinely cares about my wellbeing
## Mean scoreVote: 7.64944186629876
## Stdev scoreVote: 2.21054326067456
Scorevote for Stress Levels
stress_question <- scoreMetadataDF %>% filter(grepl("stress", question, ignore.case = TRUE))
stress_votes <- scoreVotesDF %>% filter(questionId %in% stress_question$questionId)
# Calculate mean and standard deviation for the stress question
stress_scoreVote_mean <- mean(stress_votes$scoreVote, na.rm = TRUE)
stress_scoreVote_stdev <- sd(stress_votes$scoreVote, na.rm = TRUE)
# Print the results
cat("Stress scoreVote mean:", stress_scoreVote_mean, "\n")
## Stress scoreVote mean: 7.560678
cat("Stress scoreVote stdev:", stress_scoreVote_stdev, "\n")
## Stress scoreVote stdev: 1.959066
Use the ggplot library (not R’s built-in functions) to create a bar plot (horizontal or vertical orientation) of the number of companies per industry represented in this dataset (companyMetadata.csv), filtering for only the top 10 most frequent industries. (Hint: look through the dplyr library functions for help transforming your data set into the right set to use.) Make sure to customize the axes and title with meaningful labels, and that all text is clear and legible.
First: Filtering out for the top 10 industries
top_10_industries <- companyMetadataDF %>%
filter(!is.na(industry) & industry != "") %>%
group_by(industry) %>%
summarise(company_count = n()) %>%
arrange(desc(company_count)) %>%
slice_head(n = 10)
top_10_industries
## # A tibble: 10 × 2
## industry company_count
## <chr> <int>
## 1 COMPUTER_SOFTWARE_IT_SERVICES 38
## 2 FINANCIAL_SERVICES_INSURANCE 15
## 3 MANUFACTURING 13
## 4 HEALTH_CARE_SOCIAL_ASSISTANCE 8
## 5 NON_PROFIT_ORGANIZATION 8
## 6 MANAGEMENT_CONSULTING 6
## 7 UTILITIES_AND_SERVICES 6
## 8 WHOLESALE_AND_RETAIL_TRADE 6
## 9 EDUCATIONAL_SERVICES 5
## 10 ACCOMMODATION_AND_FOOT_SERVICES 4
Creating a horizontal bar plot:
library(ggplot2)
top_10_industries_plot <- top_10_industries %>%
mutate(industry = gsub("_", " ", industry))
ggplot(top_10_industries_plot, aes(x = industry, y = company_count)) + theme_minimal() + labs(title = "Top 10 Industries by Number of Companies", x = "Industry", y = "Number of Companies") + geom_col() + coord_flip()
Create box plots showing distribution of scoreVote by the different score categories (i.e., scoreId). Hint: to execute this task, you’ll need to join the scoreMeta table and scoreVotes table, using the ‘name’ in the scoreMeta table as the x-axis. Again, make sure your axes, title, and labels are clear and meaningful. Give a sentence of analysis about this figure; at a high level, what does it mean/what can we learn from it?
# Condensing to only the key score IDs, then joining with the DF
scoreMetadataDF_unique <- scoreMetadataDF %>%
distinct(scoreId, .keep_all = TRUE)
score_data <- scoreVotesDF %>%
inner_join(scoreMetadataDF_unique, by = "scoreId")
ggplot(score_data, aes(x = name, y = scoreVote)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Distribution of ScoreVote by Score Category",
x = "Score Category",
y = "Score Vote"
) + coord_flip()
At a high level, we can conclude from this data that while there is a high level of variation, employees are on average quite satisfied and self-motivated for their jobs. However, they on average do not get reward and recognition, and only feel partially aligned with their work.
We use the wellbeing votes questions we had before and then join it with the company data DF
wellbeing_votes_with_company <- wellbeing_votes %>%
inner_join(companyMetadataDF, by = "companyId")
head(wellbeing_votes_with_company, 10)
## companyId employeeId departmentId
## 1 57908a2622881200033b34d7 453 5b505f30d1c5730004f2d986
## 2 57908a2622881200033b34d7 445 5ba1563430cf530004025c7e
## 3 57908a2622881200033b34d7 455 5ba1563430cf530004025c7e
## 4 57908a2622881200033b34d7 464 5ba1563430cf530004025c7e
## 5 57908a2622881200033b34d7 512 5ba1563430cf530004025c7e
## 6 57908a2622881200033b34d7 511 5ba1563430cf530004025c7e
## 7 57908a2622881200033b34d7 422 5ba1563430cf530004025c7e
## 8 57908a2622881200033b34d7 465 5ba1563430cf530004025c7e
## 9 57908a2622881200033b34d7 498 5ba1563430cf530004025c7e
## 10 57908a2622881200033b34d7 464 5ba1563430cf530004025c7e
## scoreId factorId questionId
## 1 5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 2 5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 3 5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 4 5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 5 5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 6 5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 7 5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 8 5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 9 5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 10 5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## date scoreVote industry timezone
## 1 2020-01-20 00:00:00 4 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 2 2019-11-25 00:00:00 4 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 3 2019-12-23 00:00:00 4 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 4 2020-01-29 00:00:00 4 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 5 2020-01-30 00:00:00 5 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 6 2020-02-11 00:00:00 1 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 7 2020-02-13 00:00:00 5 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 8 2020-03-07 00:00:00 5 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 9 2020-03-11 00:00:00 4 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 10 2020-05-02 00:00:00 5 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
We then filter them by company and then calculate the mean for each. Taking the top 10, we find:
avg_wellbeing_by_company <- wellbeing_votes_with_company %>%
group_by(companyId) %>%
summarise(average_wellbeing_score = mean(scoreVote, na.rm = TRUE))
head(avg_wellbeing_by_company, 10)
## # A tibble: 10 × 2
## companyId average_wellbeing_score
## <chr> <dbl>
## 1 56aec740f1ef260003e307d6 6.24
## 2 56fd2b64f41c670003f643c8 5.21
## 3 57908a2622881200033b34d7 6.46
## 4 57c4aa7dbb8b5c000396fd3b 6.55
## 5 594980450af0930004ad2e31 5.97
## 6 59e5e76eda4f2100041a722c 5.33
## 7 5a5c65acda7ea50004af7996 5.56
## 8 5a8c1d3da359580004047bcb 6.17
## 9 5ad7d22a40bbc80004b73727 5.35
## 10 5b3ddd08ce75e500040f8c76 5.62
Finally sorting by wellbeing, we see the company with the highest average wellbeing score:
library(ggplot2)
top_wellbeing_company <- avg_wellbeing_by_company %>%
slice_max(average_wellbeing_score, n = 1)
top_wellbeing_company
## # A tibble: 1 × 2
## companyId average_wellbeing_score
## <chr> <dbl>
## 1 5f60e1c51f5cb400047183e0 6.61
Compare Happiness Index (hiVote) between the two industries ARTS_ENTERTAINMENT_RECREATION and FINANCIAL_SERVICES_INSURANCE. Write a hypothesis, predicting which of the two industries will have a higher hiVote score. Conduct a t-test to evaluate your hypothesis: report the mean hiVote for the industries and the p-value, identifying whether the results are significant based on a significance level of 0.05. Write a sentence describing the result of your statistical findings on your chosen hypothesis.
Hypothesis: I believe that the Financial Services Insurance people will on average have a lower happiness index than the Arts/Entertainment/Recreation
industry_2_hiVotes <- companyMetadataDF %>%
filter(industry %in% c("ARTS_ENTERTAINMENT_RECREATION", "FINANCIAL_SERVICES_INSURANCE")) %>%
inner_join(hiVotesDF, by = "companyId")
head(industry_2_hiVotes, 10)
## companyId industry timezone
## 1 56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 2 56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 3 56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 4 56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 5 56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 6 56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 7 56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 8 56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 9 56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 10 56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## employeeId departmentId date hiVote
## 1 83 56aec8e1f1ef260003e307d8 2016-02-01 00:00:00 2
## 2 84 56aec8e1f1ef260003e307d8 2016-02-01 00:00:00 3
## 3 79 56aec8e1f1ef260003e307d8 2016-02-01 00:00:00 4
## 4 94 56aec8e1f1ef260003e307d8 2016-02-01 00:00:00 4
## 5 61 56aec8e1f1ef260003e307d8 2016-02-02 00:00:00 3
## 6 94 56aec8e1f1ef260003e307d8 2016-02-02 00:00:00 3
## 7 61 56aec8e1f1ef260003e307d8 2016-02-03 00:00:00 3
## 8 94 56aec8e1f1ef260003e307d8 2016-02-04 00:00:00 3
## 9 61 56aec8e1f1ef260003e307d8 2016-02-04 00:00:00 4
## 10 58 56aec8e1f1ef260003e307d8 2016-02-05 00:00:00 3
So far this seems to prove my hypothesis that the hivote of the arts industry is lower… Let’s investigate further.
t_test_results <- t.test(
hiVote ~ industry,
data = industry_2_hiVotes,
)
#Printing it to look nice bcs I kind of have OCD
t_statistic <- t_test_results$statistic
p_value <- t_test_results$p.value
mean_arts_entertainment <- t_test_results$estimate[1]
mean_financial_services <- t_test_results$estimate[2]
confidence_interval <- t_test_results$conf.int
# Format and print the results
cat("T-Test Results:\n")
## T-Test Results:
cat("---------------------------\n")
## ---------------------------
cat("Mean hiVote for ARTS_ENTERTAINMENT_RECREATION:", round(mean_arts_entertainment, 2), "\n")
## Mean hiVote for ARTS_ENTERTAINMENT_RECREATION: 3.37
cat("Mean hiVote for FINANCIAL_SERVICES_INSURANCE:", round(mean_financial_services, 2), "\n")
## Mean hiVote for FINANCIAL_SERVICES_INSURANCE: 3.04
cat("T-Statistic:", round(t_statistic, 2), "\n")
## T-Statistic: 2.44
cat("P-Value:", format.pval(p_value, digits = 4), "\n")
## P-Value: 0.01829
cat("95% Confidence Interval:", round(confidence_interval[1], 2), "to", round(confidence_interval[2], 2), "\n")
## 95% Confidence Interval: 0.06 to 0.59
Interpreting this T-Test:
We find that on average, companies in Arts report a higher happiness index compared to the financial services industry. With a T-statistic of 2.44, we also see that relative to the variability of the data, this is a significant difference of happiness as well.
We also find that the p-value is 0.018 < 0.05, meaning that this result is statistically significant.
With a 95% confident interval, we find that the difference in happiness values can be anywhere from 0.06 to 0.59, meaning that we are confident that the arts sector is happier, even if it’s by a small amount.
The most appealing/intuitive approach to this question is just to just reference the happiness Index:
company_hiVotes <- companyMetadataDF %>%
inner_join(hiVotesDF, by = "companyId")
head(company_hiVotes, 10)
## companyId industry timezone
## 1 57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 2 57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 3 57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 4 57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 5 57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 6 57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 7 57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 8 57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 9 57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 10 57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## employeeId departmentId date hiVote
## 1 461 57df9888838c0d0003e2abae 2021-01-27 00:00:00 4
## 2 354 57df9888838c0d0003e2abae 2021-01-28 00:00:00 4
## 3 461 57df9888838c0d0003e2abae 2021-01-28 00:00:00 4
## 4 461 57df9888838c0d0003e2abae 2021-01-29 00:00:00 4
## 5 461 57df9888838c0d0003e2abae 2021-02-01 00:00:00 4
## 6 354 57df9888838c0d0003e2abae 2021-02-02 00:00:00 4
## 7 461 57df9888838c0d0003e2abae 2021-02-02 00:00:00 4
## 8 8 58f616f0ca49f6000474c546 2017-04-18 00:00:00 2
## 9 8 58f616f0ca49f6000474c546 2017-04-19 00:00:00 4
## 10 77 58f616f0ca49f6000474c546 2017-04-20 00:00:00 4
# Calculate the mean hiVote for each company
mean_hiVote_by_company <- company_hiVotes %>%
group_by(companyId) %>%
summarise(mean_hiVote = mean(hiVote, na.rm = TRUE))
head(mean_hiVote_by_company, 10)
## # A tibble: 10 × 2
## companyId mean_hiVote
## <chr> <dbl>
## 1 56aec740f1ef260003e307d6 3.21
## 2 56fd2b64f41c670003f643c8 2.70
## 3 574c423856b6300003009953 3.17
## 4 57908a2622881200033b34d7 3.49
## 5 57ac8b23be7fe30003e656d0 2.81
## 6 57c01f7e8e0e090003ab5955 2.97
## 7 57c4aa7dbb8b5c000396fd3b 3.30
## 8 58fe01b88028ec0004a54105 2.60
## 9 590047cd5a640900042eb54e 2.68
## 10 59439923a74bf90004bfc467 2.64
top_hi_company <- mean_hiVote_by_company %>%
slice_max(mean_hiVote, n = 1)
top_hi_company
## # A tibble: 1 × 2
## companyId mean_hiVote
## <chr> <dbl>
## 1 60a37307fe5648659c7abf25 3.67
Here, we find the “happiest” company (depending on if we define happiness by Happiness Index).
For the company you chose in the previous question, show how analyzing the data differently would give you a different finding. Others might believe that the recommendation to work at a company is the most important factor - employees will only recommend others to work at a company that they’re happy working at.
recommend_question <- scoreMetadataDF %>% filter(grepl("likely are you to recommend", question, ignore.case = TRUE))
recommend_votes <- scoreVotesDF %>% filter(questionId %in% recommend_question$questionId)
head(recommend_votes, 10)
## companyId employeeId departmentId
## 1 57908a2622881200033b34d7 256 5b505f30d1c5730004f2d986
## 2 57908a2622881200033b34d7 429 5b505f30d1c5730004f2d986
## 3 57908a2622881200033b34d7 471 5b505f30d1c5730004f2d986
## 4 57908a2622881200033b34d7 467 5b505f30d1c5730004f2d986
## 5 57908a2622881200033b34d7 508 5b505f30d1c5730004f2d986
## 6 57908a2622881200033b34d7 503 5b505f30d1c5730004f2d986
## 7 57908a2622881200033b34d7 521 5b505f30d1c5730004f2d986
## 8 57908a2622881200033b34d7 530 5b505f30d1c5730004f2d986
## 9 57908a2622881200033b34d7 251 5ba1563430cf530004025c7e
## 10 57908a2622881200033b34d7 263 5ba1563430cf530004025c7e
## scoreId factorId questionId
## 1 5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 2 5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 3 5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 4 5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 5 5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 6 5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 7 5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 8 5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 9 5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 10 5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## date scoreVote
## 1 2018-12-19 00:00:00 8
## 2 2019-09-10 00:00:00 10
## 3 2019-12-12 00:00:00 10
## 4 2019-12-13 00:00:00 8
## 5 2020-03-04 00:00:00 8
## 6 2020-03-04 00:00:00 10
## 7 2020-03-05 00:00:00 10
## 8 2020-03-18 00:00:00 10
## 9 2018-12-09 00:00:00 10
## 10 2018-12-27 00:00:00 8
We then look at the top 10 companies by recommender score:
top_hi_company_id <- top_hi_company$companyId
recommend_score <- scoreVotesDF %>%
group_by(companyId) %>%
summarise(mean_rec_vote = mean(scoreVote, na.rm = TRUE)) %>%
arrange(desc(mean_rec_vote)) %>%
slice_head(n = 10)
head(recommend_score, 10)
## # A tibble: 10 × 2
## companyId mean_rec_vote
## <chr> <dbl>
## 1 5c5031fd2e9d970004262a20 8.75
## 2 5c73c78750b72e0004cab5c9 8.41
## 3 574c423856b6300003009953 8.31
## 4 5c54173ba6705a0004651d6d 8.17
## 5 5fa00a76f553b600042d6011 7.73
## 6 60113050f50ef100042b1494 7.31
## 7 5dee8369418c2c0004fc6f4e 7.21
## 8 5f60e1c51f5cb400047183e0 7.13
## 9 60770c99f17ad90003452a13 7.01
## 10 60a37307fe5648659c7abf25 7
Here, we find that the company we had previously identified scores 10th on whether others would recommend them or not.
a. Run a one-way ANOVA to determine whether significant differences in mean hiVote exist across industries. Report the p-value and interpret the results.
joined_df <- hiVotesDF %>%
inner_join(companyMetadataDF, by = "companyId")
industry_hiVotes <- joined_df %>%
filter(!is.na(hiVote)) %>%
group_by(industry) %>%
summarise(mean_hiVote = mean(hiVote, na.rm = TRUE))
# Calculating ANOVA
anova_results <- aov(hiVote ~ industry, data = joined_df, na.action = na.omit)
# Formatting the summary because I have OCD
anova_summary <- summary(anova_results)[[1]]
degrees_freedom_industry <- anova_summary$Df[1]
degrees_freedom_residual <- anova_summary$Df[2]
sum_sq_industry <- anova_summary$`Sum Sq`[1]
sum_sq_residual <- anova_summary$`Sum Sq`[2]
mean_sq_industry <- anova_summary$`Mean Sq`[1]
mean_sq_residual <- anova_summary$`Mean Sq`[2]
f_value <- anova_summary$`F value`[1]
p_value <- anova_summary$`Pr(>F)`[1]
cat("One-Way ANOVA Results:\n")
## One-Way ANOVA Results:
cat("---------------------------\n")
## ---------------------------
cat("Degrees of Freedom (Industry):", degrees_freedom_industry, "\n")
## Degrees of Freedom (Industry): 15
cat("Degrees of Freedom (Residuals):", degrees_freedom_residual, "\n")
## Degrees of Freedom (Residuals): 2302342
cat("Sum of Squares (Industry):", round(sum_sq_industry, 2), "\n")
## Sum of Squares (Industry): 34824.46
cat("Sum of Squares (Residuals):", round(sum_sq_residual, 2), "\n")
## Sum of Squares (Residuals): 2176257
cat("Mean Square (Industry):", round(mean_sq_industry, 2), "\n")
## Mean Square (Industry): 2321.63
cat("Mean Square (Residuals):", round(mean_sq_residual, 2), "\n")
## Mean Square (Residuals): 0.95
cat("F-Statistic:", round(f_value, 2), "\n")
## F-Statistic: 2456.14
cat("P-Value:", format.pval(p_value, digits = 4), "\n")
## P-Value: < 2.2e-16
Interpreting the results of this: from the sum of squares values, we see that the variability between and within industries is quite significant, meaning that while industry might explain some of the differences in employees’ happiness index, it’s not the full picture. However, looking at the extremely small p value, we can still conclude that industry is a strong and significant predictor of happiness index.
b. Imagine (regardless of the outcome of the previous step) that your results indicate a statistically significant difference. In class, we’ve discussed strategies for post-hoc analysis, including interpreting visualizations and conducting post-hoc testing. Suppose you simply ran Tukey’s test as your post-hoc analysis. How many comparisons would that statistical test be making?
I honestly didn’t know what Tukey’s test was. From my understanding (from the internet), it makes pairwise comparisons between all possible pairs of industries’ HI to identify which specific industry HI means would differ significantly. Because it takes every pair, this means that this would run \(\binom n2\) times. Since we have 15 industries, this would be \(\binom {15} 2=105\) comparisons.
c. In a sentence or two, what might be a better analysis step?
Looking at our data, we aren’t given equal variances between groups - meaning that we might use the Games-Howell test instead. While it is more complex, it also accounts for the fact that the variance in HI might differ between industries, since all industries are not built equally.
Preparing the merged timezone df by company ID
timezone_df <- scoreVotesDF %>%
inner_join(companyMetadataDF, by = "companyId")
# Print the first few rows of the joined data frame
unique(timezone_df$timezone)
## [1] "Europe/Madrid" "America/Guayaquil"
## [3] "America/Mexico_City" "America/Sao_Paulo"
## [5] "GMT" "Europe/Berlin"
## [7] "Pacific/Galapagos" "Europe/London"
## [9] "America/Guatemala" "America/Bogota"
## [11] "America/Santiago" "Europe/Luxembourg"
## [13] "America/Argentina/Buenos_Aires"
timezone_df$scoreVote <- scale(timezone_df$scoreVote, center = TRUE, scale = FALSE) #Scaling by timezone to eliminate intercept
lin_reg <- lm(scoreVote ~ timezone, data = timezone_df)
summary(lin_reg)
##
## Call:
## lm(formula = scoreVote ~ timezone, data = timezone_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9658 -2.5307 0.4693 2.4693 4.8389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.04488 0.06591 0.681 0.495929
## timezoneAmerica/Bogota -0.01668 0.08257 -0.202 0.839922
## timezoneAmerica/Guatemala 0.67738 0.06958 9.736 < 2e-16 ***
## timezoneAmerica/Guayaquil 0.69128 0.06793 10.176 < 2e-16 ***
## timezoneAmerica/Mexico_City 0.25625 0.06692 3.829 0.000129 ***
## timezoneAmerica/Santiago 0.43746 0.08244 5.307 1.12e-07 ***
## timezoneAmerica/Sao_Paulo 0.13692 0.06846 2.000 0.045495 *
## timezoneEurope/Berlin -0.13062 0.06774 -1.928 0.053813 .
## timezoneEurope/London -1.11338 0.07171 -15.525 < 2e-16 ***
## timezoneEurope/Luxembourg 0.20144 0.10736 1.876 0.060616 .
## timezoneEurope/Madrid -0.17735 0.06609 -2.683 0.007292 **
## timezoneGMT -0.06100 0.06896 -0.885 0.376408
## timezonePacific/Galapagos 0.12552 0.55004 0.228 0.819482
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.73 on 495911 degrees of freedom
## Multiple R-squared: 0.0122, Adjusted R-squared: 0.01217
## F-statistic: 510.3 on 12 and 495911 DF, p-value: < 2.2e-16
Now extracting the strongest effect by finding the largest magnitude coefficient:
coefficients_df <- summary(lin_reg)$coefficients
coefficients_df <- as.data.frame(coefficients_df)
strongest_effect <- coefficients_df[which.max(abs(coefficients_df$Estimate)), ]
strongest_effect
## Estimate Std. Error t value Pr(>|t|)
## timezoneEurope/London -1.113379 0.07171362 -15.52534 2.407474e-54
# Identify the most significant effect (smallest p-value)
most_significant_effect <- coefficients_df[which.min(coefficients_df$`Pr(>|t|)`), ]
most_significant_effect
## Estimate Std. Error t value Pr(>|t|)
## timezoneEurope/London -1.113379 0.07171362 -15.52534 2.407474e-54
It appears that London has the strongest and the most significant effect. This is a bit confusing at first, but then looking at the datapoints, we find:
timezone_counts <- timezone_df %>%
group_by(timezone) %>%
summarise(count = n()) %>%
arrange(desc(count))
head(timezone_counts, 10)
## # A tibble: 10 × 2
## timezone count
## <chr> <int>
## 1 Europe/Madrid 308876
## 2 America/Mexico_City 55788
## 3 Europe/Berlin 30572
## 4 America/Guayaquil 27592
## 5 America/Sao_Paulo 21777
## 6 GMT 18133
## 7 America/Guatemala 15016
## 8 Europe/London 9336
## 9 America/Santiago 3041
## 10 America/Bogota 3014
Effect size and statistical significance are not really related to one another, as regardless of large effect size, the data can still be statistically insignificant if the sample size is small. Conversely, a small effect size can be statistically significant if the sample size is large enough.
Thesis: I don’t think the happiness index sufficiently captures employee satisfaction. In my opinion, a more important predictor of employee satisfaction is whether they would recommend working at the company to others.
Evidence 1) The data is a bit flawed in that there are not an equal amount of respondents per company, possibly skewing the mean HI score. Take the top two scores in the piece below, for example:
# Calculate the mean hiVote for each company
mean_hiVote_by_company_and_headcount <- company_hiVotes %>%
group_by(companyId) %>%
summarise(mean_hiVote = mean(hiVote, na.rm = TRUE),
response_count = n()) %>%
arrange(desc(mean_hiVote))
head(mean_hiVote_by_company_and_headcount, 10)
## # A tibble: 10 × 3
## companyId mean_hiVote response_count
## <chr> <dbl> <int>
## 1 60a37307fe5648659c7abf25 3.67 3
## 2 5fa00a76f553b600042d6011 3.58 26
## 3 60113050f50ef100042b1494 3.57 185
## 4 5f60e1c51f5cb400047183e0 3.54 6138
## 5 6058cd9eadd6f80003319249 3.52 419
## 6 5bb3503c375232000426c410 3.50 31467
## 7 57908a2622881200033b34d7 3.49 7146
## 8 602e81c1d3548400031f48a6 3.42 4481
## 9 5faee32997df24000430d586 3.39 9217
## 10 6047ac8b55de6f000456ca5d 3.37 49
One might argue that 6138 employees testifying a mean high vote of 3.54 is more significant than 3 employees giving a mean high vote of 3.7.
Evidence 2) Simply taking the mean hiVote does not take into account the variance in responses.
mean_hiVote_by_company_with_variance <- company_hiVotes %>%
group_by(companyId) %>%
summarise(mean_hiVote = mean(hiVote, na.rm = TRUE),
variance = var(hiVote, na.rm = TRUE)) %>%
arrange(desc(mean_hiVote))
head(mean_hiVote_by_company_with_variance, 10)
## # A tibble: 10 × 3
## companyId mean_hiVote variance
## <chr> <dbl> <dbl>
## 1 60a37307fe5648659c7abf25 3.67 0.333
## 2 5fa00a76f553b600042d6011 3.58 0.254
## 3 60113050f50ef100042b1494 3.57 0.311
## 4 5f60e1c51f5cb400047183e0 3.54 0.451
## 5 6058cd9eadd6f80003319249 3.52 0.475
## 6 5bb3503c375232000426c410 3.50 0.450
## 7 57908a2622881200033b34d7 3.49 0.509
## 8 602e81c1d3548400031f48a6 3.42 0.562
## 9 5faee32997df24000430d586 3.39 0.507
## 10 6047ac8b55de6f000456ca5d 3.37 0.862
Again, the company with the highest mean hiVote also has more variance than the second highest company. Even if the second highest company is a bit less happy (3.57 as opposed to 3.67), if the variance is lower that also means that employees are more uniformly this level of happy.
Evidence 3) Mean HiVote alone fails to capture happiness over time. While this is just a toy example, it might be useful to the company to view the happiness of their employees over a set amount of time, instead of overall.
# Identify employee IDs with more than >1000 responses to properly identify trends
multi_response_ids <- hiVotesDF %>%
count(employeeId) %>%
filter(n > 10000) %>%
pull(employeeId)
# Filtering only for employees with sufficient datapoints
filtered_hiVotes <- hiVotesDF %>%
mutate(date = as.Date(date)) %>% # Date so we can space shit out later
filter(employeeId %in% multi_response_ids) %>%
arrange(desc(employeeId))
head(filtered_hiVotes, 10)
## companyId employeeId departmentId date
## 1 57c4aa7dbb8b5c000396fd3b 24 57df9888838c0d0003e2abae 2016-09-19
## 2 57c4aa7dbb8b5c000396fd3b 24 57df9888838c0d0003e2abae 2016-09-20
## 3 57c4aa7dbb8b5c000396fd3b 24 57df9888838c0d0003e2abae 2016-09-21
## 4 57c4aa7dbb8b5c000396fd3b 24 57df9888838c0d0003e2abae 2016-09-22
## 5 57c4aa7dbb8b5c000396fd3b 24 57df9888838c0d0003e2abae 2016-09-23
## 6 57c4aa7dbb8b5c000396fd3b 24 57df9888838c0d0003e2abae 2016-09-24
## 7 57c4aa7dbb8b5c000396fd3b 24 57df9888838c0d0003e2abae 2016-09-25
## 8 57c4aa7dbb8b5c000396fd3b 24 57df9888838c0d0003e2abae 2016-09-26
## 9 57c4aa7dbb8b5c000396fd3b 24 57df9888838c0d0003e2abae 2016-09-27
## 10 57c4aa7dbb8b5c000396fd3b 24 57df9888838c0d0003e2abae 2016-09-29
## hiVote
## 1 4
## 2 4
## 3 4
## 4 4
## 5 4
## 6 4
## 7 4
## 8 4
## 9 4
## 10 4
# Group them by month's average
hiVote_by_month <- filtered_hiVotes %>%
mutate(month = floor_date(date, unit = "month")) %>%
group_by(employeeId, month) %>%
summarise(mean_hiVote = mean(hiVote, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'employeeId'. You can override using the
## `.groups` argument.
# Visualize trend using a line graph
ggplot(hiVote_by_month, aes(x = month, y = mean_hiVote, color = as.factor(employeeId), group = employeeId)) +
geom_line() +
geom_point() +
labs(
title = "Trend of Mean hiVote Over Time for Select Employees",
x = "Month",
y = "Mean HI",
color = "Employee ID"
) +
scale_x_date(
date_breaks = "1 year",
date_labels = "%b %Y"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
company_data <- filtered_hiVotes %>%
filter(companyId == "5bb3503c375232000426c410")
mean_happiness_by_day <- company_data %>%
group_by(date) %>%
summarise(mean_happiness = mean(hiVote, na.rm = TRUE)) %>%
ungroup()
head(mean_happiness_by_day, 10)
## # A tibble: 10 × 2
## date mean_happiness
## <date> <dbl>
## 1 2018-10-23 4
## 2 2018-10-24 4
## 3 2018-10-25 3.5
## 4 2018-10-26 3
## 5 2018-10-27 3
## 6 2018-10-29 3
## 7 2018-10-30 3.67
## 8 2018-11-01 2
## 9 2018-11-05 3
## 10 2018-11-06 4
Look! My employees are so happy (on specific dates!)
In a similar way, we could focus on departments with particularly bad HI scores. Take the department that scores 1.00, for example. If everyone in this department we cherrypicked hates their job surely everyone at this company hates their jobs just as much.
department_happiness <- filtered_hiVotes %>%
group_by(departmentId) %>%
summarise(mean_happiness = mean(hiVote, na.rm = TRUE)) %>%
ungroup()
head(department_happiness, 10)
## # A tibble: 10 × 2
## departmentId mean_happiness
## <chr> <dbl>
## 1 56aec900f1ef260003e307d9 3.07
## 2 576457853d77930003a4db82 3.30
## 3 576910af887958000317c79b 3.69
## 4 5769115c887958000317c7a1 3.98
## 5 57691165887958000317c7a2 3.95
## 6 57c01ffc8e0e090003ab5958 1
## 7 57c020018e0e090003ab5959 3.07
## 8 57df9888838c0d0003e2abae 3.72
## 9 58f61705ca49f6000474c54a 2.93
## 10 58ffc0376bf1b70004366aa0 2.23
What company is the most uniform, culturally? (You can try and prove this by determining company has the least variation across departments)
department_variance <- hiVotesDF %>%
group_by(companyId, departmentId) %>%
summarise(mean_happiness = mean(hiVote, na.rm = TRUE), .groups = "drop")
head(department_variance, 10)
## # A tibble: 10 × 3
## companyId departmentId mean_happiness
## <chr> <chr> <dbl>
## 1 56aec740f1ef260003e307d6 56aec8e1f1ef260003e307d8 3.33
## 2 56aec740f1ef260003e307d6 56aec900f1ef260003e307d9 3.16
## 3 56aec740f1ef260003e307d6 56cbdc1c761e8400033baed4 3.08
## 4 56aec740f1ef260003e307d6 56cbdc52761e8400033baed5 3.40
## 5 56aec740f1ef260003e307d6 576457853d77930003a4db82 3.23
## 6 56aec740f1ef260003e307d6 576457993d77930003a4db83 3.34
## 7 56aec740f1ef260003e307d6 576457ae3d77930003a4db84 2.91
## 8 56aec740f1ef260003e307d6 5a9406994763b40004906e0e 3.57
## 9 56fd2b64f41c670003f643c8 59358519497fd300040ea898 2.59
## 10 56fd2b64f41c670003f643c8 59358523497fd300040ea899 2.62
company_variation <- department_variance %>%
group_by(companyId) %>%
summarise(variance = var(mean_happiness, na.rm = TRUE), .groups = "drop") %>%
arrange(variance)
head(company_variation, 10)
## # A tibble: 10 × 2
## companyId variance
## <chr> <dbl>
## 1 6101220e42e92f2213502851 0.00207
## 2 5aaaafad4ddefd000481094f 0.00278
## 3 60c7297b4bcbc001be879aad 0.00428
## 4 5e6776f6d72b3700047d261d 0.00605
## 5 5f9999e442e9f500044d7177 0.0146
## 6 60770c99f17ad90003452a13 0.0148
## 7 5a96be30aa68f4000423a2c5 0.0188
## 8 60e577836e505a4c371d7bf3 0.0194
## 9 5ad7d22a40bbc80004b73727 0.0201
## 10 5bb3503c375232000426c410 0.0219