if(!require(dplyr)) {
  install.packages("dplyr")
  library(dplyr)
}
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
if(!require(ggplot2)) {
  install.packages("ggplot2")
  library(ggplot2)
}
## Loading required package: ggplot2
if(!require(scales)) {
  install.packages("scales")
  library(scales)
}
## Loading required package: scales
if(!require(lubridate)) {
  install.packages("lubridate")
  library(lubridate)
}
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 4.4.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Depth Exercise: R Self-Study

This exercise uses the HF 2021 Employee Engagement Dataset from Kaggle. The data comes from a platform called Happyforce that helps companies measure “Employee Engagement” and collect feedback throughout the year. You will need the following files:

For this exercise, you will be computing a few quantitative measures of this dataset using R and then reflecting on the data collection and analysis process.

Part 0. Loading the dataframes

hiVotesDF <- read.csv("C:/Users/diana/Downloads/archive/hiVotes.csv")
scoreMetadataDF <- read.csv("C:/Users/diana/Downloads/archive/scoreMetadata.csv")
scoreVotesDF <- read.csv("C:/Users/diana/Downloads/archive/scoreVotes.csv")
companyMetadataDF <- read.csv("C:/Users/diana/Downloads/archive/companyMetadata.csv")
employeesDF <- read.csv("C:/Users/diana/Downloads/archive/employees.csv")

The dataset is composed of 5 files:

  • companyMetadata.csv: This file contains the list of company hashed ids, it’s industry and timezones.

  • employees.csv: This file contains the list of all employees that have, at least, participated once on any question on the dataset.. Also some metadata like hiringdate, birthdate and gender is provided when available. Also the emlployee status (if it stills available on the system or not).

  • scoreMetadata.csv: Contains the list of Scores, Factors and Questions with their ids, for reference on other files.

  • hiVotes.csv: Contains all the votes of the employees for all companies on the HI Question.

  • scoreVotes.csv: Contains all the votes of the employees for all companies on the Score Questions.

head(hiVotesDF, 10) 
##                   companyId employeeId             departmentId
## 1  57908a2622881200033b34d7        461 57df9888838c0d0003e2abae
## 2  57908a2622881200033b34d7        354 57df9888838c0d0003e2abae
## 3  57908a2622881200033b34d7        461 57df9888838c0d0003e2abae
## 4  57908a2622881200033b34d7        461 57df9888838c0d0003e2abae
## 5  57908a2622881200033b34d7        461 57df9888838c0d0003e2abae
## 6  57908a2622881200033b34d7        354 57df9888838c0d0003e2abae
## 7  57908a2622881200033b34d7        461 57df9888838c0d0003e2abae
## 8  57908a2622881200033b34d7          8 58f616f0ca49f6000474c546
## 9  57908a2622881200033b34d7          8 58f616f0ca49f6000474c546
## 10 57908a2622881200033b34d7         77 58f616f0ca49f6000474c546
##                   date hiVote
## 1  2021-01-27 00:00:00      4
## 2  2021-01-28 00:00:00      4
## 3  2021-01-28 00:00:00      4
## 4  2021-01-29 00:00:00      4
## 5  2021-02-01 00:00:00      4
## 6  2021-02-02 00:00:00      4
## 7  2021-02-02 00:00:00      4
## 8  2017-04-18 00:00:00      2
## 9  2017-04-19 00:00:00      4
## 10 2017-04-20 00:00:00      4
head(scoreMetadataDF, 10) 
##                     scoreId                 name                 factorId
## 1  5df0cbe8d8e67200048f1b09        Relationships 5df0cbe8d8e67200048f1b05
## 2  5df0cbe8d8e67200048f1b09        Relationships 5df0cbe8d8e67200048f1b05
## 3  5df0cbe8d8e67200048f1b09        Relationships 5df0cbe8d8e67200048f1b05
## 4  5df0cbe8d8e67200048f1b09        Relationships 5df0e70ad8e67200048f1c0c
## 5  5df0cbe8d8e67200048f1b09        Relationships 5df0e70ad8e67200048f1c0c
## 6  5df0cbe8d8e67200048f1b09        Relationships 5df0e70ad8e67200048f1c0c
## 7  5df0cbe8d8e67200048f1b09        Relationships 5df0e70ad8e67200048f1c10
## 8  5df0cbe8d8e67200048f1b09        Relationships 5df0e70ad8e67200048f1c10
## 9  5df0cbe8d8e67200048f1b09        Relationships 5df0e70ad8e67200048f1c10
## 10 5dd6d642103b810004915e04 Intrinsic Motivation 5dd6d642103b810004915dfb
##        factor               questionId
## 1    Managers 5df0cbe8d8e67200048f1b06
## 2    Managers 5df0cbe8d8e67200048f1b07
## 3    Managers 5df0cbe8d8e67200048f1b08
## 4       Peers 5df0e70ad8e67200048f1c0d
## 5       Peers 5df0e70ad8e67200048f1c0e
## 6       Peers 5df0e70ad8e67200048f1c0f
## 7  Confidence 5df0e70ad8e67200048f1c11
## 8  Confidence 5df0e70ad8e67200048f1c12
## 9  Confidence 5df0e70ad8e67200048f1c13
## 10   Autonomy 5dd6d642103b810004915dfc
##                                                               question
## 1                    What is it like working with your direct manager?
## 2                        My manager helps bring the best out of people
## 3       When I ask my manager for help, he/she devotes time to help me
## 4                                      I feel like I am part of a team
## 5                 I can count on my co-workers to help out when needed
## 6    *|COMPANY_NAME|* is the kind of place where I could make friends.
## 7   I trust that the people I work with are committed to do a good job
## 8               I would rate my direct manager's management skills as:
## 9  My confidence in the senior leadership team of *|COMPANY_NAME|* is:
## 10      I rate the freedom that I have to decide how to do my work as:
head(scoreVotesDF, 10) 
##                   companyId employeeId             departmentId
## 1  57908a2622881200033b34d7        354 57df9888838c0d0003e2abae
## 2  57908a2622881200033b34d7        471 5b505f30d1c5730004f2d986
## 3  57908a2622881200033b34d7        505 5b505f30d1c5730004f2d986
## 4  57908a2622881200033b34d7        503 5b505f30d1c5730004f2d986
## 5  57908a2622881200033b34d7        521 5b505f30d1c5730004f2d986
## 6  57908a2622881200033b34d7        572 5b505f30d1c5730004f2d986
## 7  57908a2622881200033b34d7        571 5b505f30d1c5730004f2d986
## 8  57908a2622881200033b34d7        455 5ba1563430cf530004025c7e
## 9  57908a2622881200033b34d7        465 5ba1563430cf530004025c7e
## 10 57908a2622881200033b34d7        455 5ba1563430cf530004025c7e
##                     scoreId                 factorId               questionId
## 1  5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 2  5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 3  5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 4  5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 5  5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 6  5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 7  5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 8  5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 9  5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
## 10 5df0cbe8d8e67200048f1b09 5df0cbe8d8e67200048f1b05 5df0cbe8d8e67200048f1b06
##                   date scoreVote
## 1  2021-02-02 00:00:00         5
## 2  2020-01-22 00:00:00         4
## 3  2020-01-24 00:00:00         5
## 4  2020-02-21 00:00:00         3
## 5  2020-02-28 00:00:00         5
## 6  2020-07-23 00:00:00         4
## 7  2020-09-23 00:00:00         4
## 8  2019-12-28 00:00:00         4
## 9  2020-01-03 00:00:00         5
## 10 2020-01-20 00:00:00         5
head(companyMetadataDF, 10) 
##                   companyId                      industry          timezone
## 1  57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES     Europe/Madrid
## 2  57c4aa7dbb8b5c000396fd3b HEALTH_CARE_SOCIAL_ASSISTANCE America/Guayaquil
## 3  56fd2b64f41c670003f643c8                 MANUFACTURING     Europe/Madrid
## 4  57ac8b23be7fe30003e656d0 COMPUTER_SOFTWARE_IT_SERVICES     Europe/Madrid
## 5  57c01f7e8e0e090003ab5955 COMPUTER_SOFTWARE_IT_SERVICES     Europe/Madrid
## 6  590047cd5a640900042eb54e                 MANUFACTURING     Europe/Madrid
## 7  59439923a74bf90004bfc467       NON_PROFIT_ORGANIZATION     Europe/Madrid
## 8  56aec740f1ef260003e307d6  FINANCIAL_SERVICES_INSURANCE     Europe/Madrid
## 9  574c423856b6300003009953        UTILITIES_AND_SERVICES     Europe/Madrid
## 10 595e5c6ed2b2cb00042fd136                 MANUFACTURING     Europe/Warsaw
head(employeesDF, 10)
##                   companyId employeeId gender hiringDate           birthDate
## 1  57908a2622881200033b34d7        354                   1974-07-23 00:00:00
## 2  57908a2622881200033b34d7        471                                      
## 3  57908a2622881200033b34d7        505                                      
## 4  57908a2622881200033b34d7        503                                      
## 5  57908a2622881200033b34d7        521                                      
## 6  57908a2622881200033b34d7        572                                      
## 7  57908a2622881200033b34d7        571                                      
## 8  57908a2622881200033b34d7        455                                      
## 9  57908a2622881200033b34d7        465                   1982-04-19 00:00:00
## 10 57908a2622881200033b34d7        464                                      
##    deleted        deletionDate
## 1    false                    
## 2     true 2020-11-12 11:31:41
## 3     true 2020-07-14 19:20:39
## 4                             
## 5    false                    
## 6    false                    
## 7    false                    
## 8    false                    
## 9    false                    
## 10   false

Part 1: Computing Quantitative Measures

For this exercise, you will be computing a few quantitative measures of the HF dataset. Use R to compute these quantitative measures.

1. Mean and standard Deviation Calculations

Complete the following table of mean and standard deviation for different features in the dataset (relevant files shown in parentheses). Round your answers to two digits after the decimal (nearest hundredth).

Mean and StDev for HiVote

hiVote_mean <- mean(hiVotesDF$hiVote, na.rm = TRUE)
hiVote_stdev <- sd(hiVotesDF$hiVote, na.rm = TRUE)

cat("hiVote mean:", hiVote_mean, "\n")
## hiVote mean: 2.915459
cat("hiVote stdev:", hiVote_stdev, "\n")
## hiVote stdev: 0.9799774

Mean and StDev for ScoreVote

scoreVote_mean <- mean(scoreVotesDF$scoreVote, na.rm = TRUE)
scoreVote_stdev <- sd(scoreVotesDF$scoreVote, na.rm = TRUE)

cat("scoreVote mean:", scoreVote_mean, "\n")
## scoreVote mean: 6.229596
cat("scoreVote stdev:", scoreVote_stdev, "\n")
## scoreVote stdev: 2.747134

Scorevote for Wellbeing

wellbeing_questions <- scoreMetadataDF %>% filter(grepl("wellbeing", question, ignore.case = TRUE))


wellbeing_votes <- scoreVotesDF %>% filter(questionId %in% wellbeing_questions$questionId)
wellbeing_question_summary <- wellbeing_votes %>%
    left_join(scoreMetadataDF, by = "questionId") %>%
    group_by(questionId, question) %>%
    summarise(
        mean_scoreVote = mean(scoreVote, na.rm = TRUE),
        stdev_scoreVote = sd(scoreVote, na.rm = TRUE),
        .groups = 'drop'
    )


wellbeing_question_summary %>%
    mutate(
        result = paste("Question type:", question,
                       "\nMean scoreVote:", mean_scoreVote,
                       "\nStdev scoreVote:", stdev_scoreVote, "\n")
    ) %>%
    pull(result) %>%
    cat(sep = "\n")
## Question type: *|COMPANY_NAME|* shows a commitment to the wellbeing of employees 
## Mean scoreVote: 3.75682637815559 
## Stdev scoreVote: 1.02379824656329 
## 
## Question type: My manager genuinely cares about my wellbeing 
## Mean scoreVote: 7.64944186629876 
## Stdev scoreVote: 2.21054326067456

Scorevote for Stress Levels

stress_question <- scoreMetadataDF %>% filter(grepl("stress", question, ignore.case = TRUE))

stress_votes <- scoreVotesDF %>% filter(questionId %in% stress_question$questionId)
# Calculate mean and standard deviation for the stress question
stress_scoreVote_mean <- mean(stress_votes$scoreVote, na.rm = TRUE)
stress_scoreVote_stdev <- sd(stress_votes$scoreVote, na.rm = TRUE)

# Print the results

cat("Stress scoreVote mean:", stress_scoreVote_mean, "\n")
## Stress scoreVote mean: 7.560678
cat("Stress scoreVote stdev:", stress_scoreVote_stdev, "\n")
## Stress scoreVote stdev: 1.959066

2. Plotting number of Companies per Industry

Use the ggplot library (not R’s built-in functions) to create a bar plot (horizontal or vertical orientation) of the number of companies per industry represented in this dataset (companyMetadata.csv), filtering for only the top 10 most frequent industries. (Hint: look through the dplyr library functions for help transforming your data set into the right set to use.) Make sure to customize the axes and title with meaningful labels, and that all text is clear and legible.

First: Filtering out for the top 10 industries

top_10_industries <- companyMetadataDF %>%
    filter(!is.na(industry) & industry != "") %>%
    group_by(industry) %>%
    summarise(company_count = n()) %>%
    arrange(desc(company_count)) %>%
    slice_head(n = 10) 

top_10_industries
## # A tibble: 10 × 2
##    industry                        company_count
##    <chr>                                   <int>
##  1 COMPUTER_SOFTWARE_IT_SERVICES              38
##  2 FINANCIAL_SERVICES_INSURANCE               15
##  3 MANUFACTURING                              13
##  4 HEALTH_CARE_SOCIAL_ASSISTANCE               8
##  5 NON_PROFIT_ORGANIZATION                     8
##  6 MANAGEMENT_CONSULTING                       6
##  7 UTILITIES_AND_SERVICES                      6
##  8 WHOLESALE_AND_RETAIL_TRADE                  6
##  9 EDUCATIONAL_SERVICES                        5
## 10 ACCOMMODATION_AND_FOOT_SERVICES             4

Creating a horizontal bar plot:

library(ggplot2)

top_10_industries_plot <- top_10_industries %>%
    mutate(industry = gsub("_", " ", industry))

ggplot(top_10_industries_plot, aes(x = industry, y = company_count)) + theme_minimal() + labs(title = "Top 10 Industries by Number of Companies", x = "Industry", y = "Number of Companies") + geom_col() + coord_flip() 

2. Distribution of ScoreVote by Score Categories

Create box plots showing distribution of scoreVote by the different score categories (i.e., scoreId). Hint: to execute this task, you’ll need to join the scoreMeta table and scoreVotes table, using the ‘name’ in the scoreMeta table as the x-axis. Again, make sure your axes, title, and labels are clear and meaningful. Give a sentence of analysis about this figure; at a high level, what does it mean/what can we learn from it? 

# Condensing to only the key score IDs, then joining with the DF
scoreMetadataDF_unique <- scoreMetadataDF %>%
    distinct(scoreId, .keep_all = TRUE)
score_data <- scoreVotesDF %>%
    inner_join(scoreMetadataDF_unique, by = "scoreId")
ggplot(score_data, aes(x = name, y = scoreVote)) +
    geom_boxplot() +
    theme_minimal() +
    labs(
        title = "Distribution of ScoreVote by Score Category",
        x = "Score Category",
        y = "Score Vote"
    ) + coord_flip() 

At a high level, we can conclude from this data that while there is a high level of variation, employees are on average quite satisfied and self-motivated for their jobs. However, they on average do not get reward and recognition, and only feel partially aligned with their work.

4. Highest Average Wellbeing Score

We use the wellbeing votes questions we had before and then join it with the company data DF

wellbeing_votes_with_company <- wellbeing_votes %>%
    inner_join(companyMetadataDF, by = "companyId")

head(wellbeing_votes_with_company, 10)
##                   companyId employeeId             departmentId
## 1  57908a2622881200033b34d7        453 5b505f30d1c5730004f2d986
## 2  57908a2622881200033b34d7        445 5ba1563430cf530004025c7e
## 3  57908a2622881200033b34d7        455 5ba1563430cf530004025c7e
## 4  57908a2622881200033b34d7        464 5ba1563430cf530004025c7e
## 5  57908a2622881200033b34d7        512 5ba1563430cf530004025c7e
## 6  57908a2622881200033b34d7        511 5ba1563430cf530004025c7e
## 7  57908a2622881200033b34d7        422 5ba1563430cf530004025c7e
## 8  57908a2622881200033b34d7        465 5ba1563430cf530004025c7e
## 9  57908a2622881200033b34d7        498 5ba1563430cf530004025c7e
## 10 57908a2622881200033b34d7        464 5ba1563430cf530004025c7e
##                     scoreId                 factorId               questionId
## 1  5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 2  5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 3  5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 4  5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 5  5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 6  5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 7  5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 8  5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 9  5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
## 10 5dd6e4a49a5137000450ff1e 5dd6e4a49a5137000450ff1a 5dd6e4a49a5137000450ff1b
##                   date scoreVote                      industry      timezone
## 1  2020-01-20 00:00:00         4 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 2  2019-11-25 00:00:00         4 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 3  2019-12-23 00:00:00         4 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 4  2020-01-29 00:00:00         4 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 5  2020-01-30 00:00:00         5 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 6  2020-02-11 00:00:00         1 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 7  2020-02-13 00:00:00         5 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 8  2020-03-07 00:00:00         5 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 9  2020-03-11 00:00:00         4 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 10 2020-05-02 00:00:00         5 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid

We then filter them by company and then calculate the mean for each. Taking the top 10, we find:

avg_wellbeing_by_company <- wellbeing_votes_with_company %>%
    group_by(companyId) %>%
    summarise(average_wellbeing_score = mean(scoreVote, na.rm = TRUE))

head(avg_wellbeing_by_company, 10)
## # A tibble: 10 × 2
##    companyId                average_wellbeing_score
##    <chr>                                      <dbl>
##  1 56aec740f1ef260003e307d6                    6.24
##  2 56fd2b64f41c670003f643c8                    5.21
##  3 57908a2622881200033b34d7                    6.46
##  4 57c4aa7dbb8b5c000396fd3b                    6.55
##  5 594980450af0930004ad2e31                    5.97
##  6 59e5e76eda4f2100041a722c                    5.33
##  7 5a5c65acda7ea50004af7996                    5.56
##  8 5a8c1d3da359580004047bcb                    6.17
##  9 5ad7d22a40bbc80004b73727                    5.35
## 10 5b3ddd08ce75e500040f8c76                    5.62

Finally sorting by wellbeing, we see the company with the highest average wellbeing score:

library(ggplot2)

top_wellbeing_company <- avg_wellbeing_by_company %>%
    slice_max(average_wellbeing_score, n = 1)

top_wellbeing_company
## # A tibble: 1 × 2
##   companyId                average_wellbeing_score
##   <chr>                                      <dbl>
## 1 5f60e1c51f5cb400047183e0                    6.61

5. Happiness Index Comparisons between Arts and Finance

Compare Happiness Index (hiVote) between the two industries ARTS_ENTERTAINMENT_RECREATION and FINANCIAL_SERVICES_INSURANCE. Write a hypothesis, predicting which of the two industries will have a higher hiVote score. Conduct a t-test to evaluate your hypothesis: report the mean hiVote for the industries and the p-value, identifying whether the results are significant based on a significance level of 0.05. Write a sentence describing the result of your statistical findings on your chosen hypothesis.

Hypothesis: I believe that the Financial Services Insurance people will on average have a lower happiness index than the Arts/Entertainment/Recreation

industry_2_hiVotes <- companyMetadataDF %>%
    filter(industry %in% c("ARTS_ENTERTAINMENT_RECREATION", "FINANCIAL_SERVICES_INSURANCE")) %>%
    inner_join(hiVotesDF, by = "companyId")

head(industry_2_hiVotes, 10)
##                   companyId                     industry      timezone
## 1  56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 2  56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 3  56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 4  56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 5  56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 6  56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 7  56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 8  56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 9  56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
## 10 56aec740f1ef260003e307d6 FINANCIAL_SERVICES_INSURANCE Europe/Madrid
##    employeeId             departmentId                date hiVote
## 1          83 56aec8e1f1ef260003e307d8 2016-02-01 00:00:00      2
## 2          84 56aec8e1f1ef260003e307d8 2016-02-01 00:00:00      3
## 3          79 56aec8e1f1ef260003e307d8 2016-02-01 00:00:00      4
## 4          94 56aec8e1f1ef260003e307d8 2016-02-01 00:00:00      4
## 5          61 56aec8e1f1ef260003e307d8 2016-02-02 00:00:00      3
## 6          94 56aec8e1f1ef260003e307d8 2016-02-02 00:00:00      3
## 7          61 56aec8e1f1ef260003e307d8 2016-02-03 00:00:00      3
## 8          94 56aec8e1f1ef260003e307d8 2016-02-04 00:00:00      3
## 9          61 56aec8e1f1ef260003e307d8 2016-02-04 00:00:00      4
## 10         58 56aec8e1f1ef260003e307d8 2016-02-05 00:00:00      3

So far this seems to prove my hypothesis that the hivote of the arts industry is lower… Let’s investigate further.

t_test_results <- t.test(
    hiVote ~ industry, 
    data = industry_2_hiVotes,
)

#Printing it to look nice bcs I kind of have OCD
t_statistic <- t_test_results$statistic
p_value <- t_test_results$p.value
mean_arts_entertainment <- t_test_results$estimate[1]
mean_financial_services <- t_test_results$estimate[2]
confidence_interval <- t_test_results$conf.int

# Format and print the results
cat("T-Test Results:\n")
## T-Test Results:
cat("---------------------------\n")
## ---------------------------
cat("Mean hiVote for ARTS_ENTERTAINMENT_RECREATION:", round(mean_arts_entertainment, 2), "\n")
## Mean hiVote for ARTS_ENTERTAINMENT_RECREATION: 3.37
cat("Mean hiVote for FINANCIAL_SERVICES_INSURANCE:", round(mean_financial_services, 2), "\n")
## Mean hiVote for FINANCIAL_SERVICES_INSURANCE: 3.04
cat("T-Statistic:", round(t_statistic, 2), "\n")
## T-Statistic: 2.44
cat("P-Value:", format.pval(p_value, digits = 4), "\n")
## P-Value: 0.01829
cat("95% Confidence Interval:", round(confidence_interval[1], 2), "to", round(confidence_interval[2], 2), "\n")
## 95% Confidence Interval: 0.06 to 0.59

Interpreting this T-Test:

  • We find that on average, companies in Arts report a higher happiness index compared to the financial services industry. With a T-statistic of 2.44, we also see that relative to the variability of the data, this is a significant difference of happiness as well.

  • We also find that the p-value is 0.018 < 0.05, meaning that this result is statistically significant.

  • With a 95% confident interval, we find that the difference in happiness values can be anywhere from 0.06 to 0.59, meaning that we are confident that the arts sector is happier, even if it’s by a small amount.

6. Which Company has the Happiest Employees?

The most appealing/intuitive approach to this question is just to just reference the happiness Index:

company_hiVotes <- companyMetadataDF %>%
    inner_join(hiVotesDF, by = "companyId")

head(company_hiVotes, 10)
##                   companyId                      industry      timezone
## 1  57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 2  57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 3  57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 4  57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 5  57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 6  57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 7  57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 8  57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 9  57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
## 10 57908a2622881200033b34d7 COMPUTER_SOFTWARE_IT_SERVICES Europe/Madrid
##    employeeId             departmentId                date hiVote
## 1         461 57df9888838c0d0003e2abae 2021-01-27 00:00:00      4
## 2         354 57df9888838c0d0003e2abae 2021-01-28 00:00:00      4
## 3         461 57df9888838c0d0003e2abae 2021-01-28 00:00:00      4
## 4         461 57df9888838c0d0003e2abae 2021-01-29 00:00:00      4
## 5         461 57df9888838c0d0003e2abae 2021-02-01 00:00:00      4
## 6         354 57df9888838c0d0003e2abae 2021-02-02 00:00:00      4
## 7         461 57df9888838c0d0003e2abae 2021-02-02 00:00:00      4
## 8           8 58f616f0ca49f6000474c546 2017-04-18 00:00:00      2
## 9           8 58f616f0ca49f6000474c546 2017-04-19 00:00:00      4
## 10         77 58f616f0ca49f6000474c546 2017-04-20 00:00:00      4
# Calculate the mean hiVote for each company
mean_hiVote_by_company <- company_hiVotes %>%
    group_by(companyId) %>%
    summarise(mean_hiVote = mean(hiVote, na.rm = TRUE))

head(mean_hiVote_by_company, 10)
## # A tibble: 10 × 2
##    companyId                mean_hiVote
##    <chr>                          <dbl>
##  1 56aec740f1ef260003e307d6        3.21
##  2 56fd2b64f41c670003f643c8        2.70
##  3 574c423856b6300003009953        3.17
##  4 57908a2622881200033b34d7        3.49
##  5 57ac8b23be7fe30003e656d0        2.81
##  6 57c01f7e8e0e090003ab5955        2.97
##  7 57c4aa7dbb8b5c000396fd3b        3.30
##  8 58fe01b88028ec0004a54105        2.60
##  9 590047cd5a640900042eb54e        2.68
## 10 59439923a74bf90004bfc467        2.64
top_hi_company <- mean_hiVote_by_company %>%
    slice_max(mean_hiVote, n = 1)

top_hi_company
## # A tibble: 1 × 2
##   companyId                mean_hiVote
##   <chr>                          <dbl>
## 1 60a37307fe5648659c7abf25        3.67

Here, we find the “happiest” company (depending on if we define happiness by Happiness Index).

7. How would analyzing the data differently result in a different finding?

For the company you chose in the previous question, show how analyzing the data differently would give you a different finding. Others might believe that the recommendation to work at a company is the most important factor - employees will only recommend others to work at a company that they’re happy working at.

recommend_question <- scoreMetadataDF %>% filter(grepl("likely are you to recommend", question, ignore.case = TRUE))


recommend_votes <- scoreVotesDF %>% filter(questionId %in% recommend_question$questionId)

head(recommend_votes, 10)
##                   companyId employeeId             departmentId
## 1  57908a2622881200033b34d7        256 5b505f30d1c5730004f2d986
## 2  57908a2622881200033b34d7        429 5b505f30d1c5730004f2d986
## 3  57908a2622881200033b34d7        471 5b505f30d1c5730004f2d986
## 4  57908a2622881200033b34d7        467 5b505f30d1c5730004f2d986
## 5  57908a2622881200033b34d7        508 5b505f30d1c5730004f2d986
## 6  57908a2622881200033b34d7        503 5b505f30d1c5730004f2d986
## 7  57908a2622881200033b34d7        521 5b505f30d1c5730004f2d986
## 8  57908a2622881200033b34d7        530 5b505f30d1c5730004f2d986
## 9  57908a2622881200033b34d7        251 5ba1563430cf530004025c7e
## 10 57908a2622881200033b34d7        263 5ba1563430cf530004025c7e
##                     scoreId                 factorId               questionId
## 1  5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 2  5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 3  5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 4  5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 5  5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 6  5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 7  5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 8  5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 9  5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
## 10 5bfff05de6caa5000429c544 5bfff05de6caa5000429c542 5bfff05de6caa5000429c543
##                   date scoreVote
## 1  2018-12-19 00:00:00         8
## 2  2019-09-10 00:00:00        10
## 3  2019-12-12 00:00:00        10
## 4  2019-12-13 00:00:00         8
## 5  2020-03-04 00:00:00         8
## 6  2020-03-04 00:00:00        10
## 7  2020-03-05 00:00:00        10
## 8  2020-03-18 00:00:00        10
## 9  2018-12-09 00:00:00        10
## 10 2018-12-27 00:00:00         8

We then look at the top 10 companies by recommender score:

top_hi_company_id <- top_hi_company$companyId


recommend_score <- scoreVotesDF %>%
    group_by(companyId) %>%
    summarise(mean_rec_vote = mean(scoreVote, na.rm = TRUE)) %>%
    arrange(desc(mean_rec_vote)) %>%
    slice_head(n = 10) 

head(recommend_score, 10)
## # A tibble: 10 × 2
##    companyId                mean_rec_vote
##    <chr>                            <dbl>
##  1 5c5031fd2e9d970004262a20          8.75
##  2 5c73c78750b72e0004cab5c9          8.41
##  3 574c423856b6300003009953          8.31
##  4 5c54173ba6705a0004651d6d          8.17
##  5 5fa00a76f553b600042d6011          7.73
##  6 60113050f50ef100042b1494          7.31
##  7 5dee8369418c2c0004fc6f4e          7.21
##  8 5f60e1c51f5cb400047183e0          7.13
##  9 60770c99f17ad90003452a13          7.01
## 10 60a37307fe5648659c7abf25          7

Here, we find that the company we had previously identified scores 10th on whether others would recommend them or not.

8. HI and Industry

a. Run a one-way ANOVA to determine whether significant differences in mean hiVote exist across industries. Report the p-value and interpret the results.

joined_df <- hiVotesDF %>%
  inner_join(companyMetadataDF, by = "companyId")

industry_hiVotes <- joined_df %>%
  filter(!is.na(hiVote)) %>%  
  group_by(industry) %>%
  summarise(mean_hiVote = mean(hiVote, na.rm = TRUE))


# Calculating ANOVA
anova_results <- aov(hiVote ~ industry, data = joined_df, na.action = na.omit)
# Formatting the summary because I have OCD
anova_summary <- summary(anova_results)[[1]]

degrees_freedom_industry <- anova_summary$Df[1]
degrees_freedom_residual <- anova_summary$Df[2]
sum_sq_industry <- anova_summary$`Sum Sq`[1]
sum_sq_residual <- anova_summary$`Sum Sq`[2]
mean_sq_industry <- anova_summary$`Mean Sq`[1]
mean_sq_residual <- anova_summary$`Mean Sq`[2]
f_value <- anova_summary$`F value`[1]
p_value <- anova_summary$`Pr(>F)`[1]

cat("One-Way ANOVA Results:\n")
## One-Way ANOVA Results:
cat("---------------------------\n")
## ---------------------------
cat("Degrees of Freedom (Industry):", degrees_freedom_industry, "\n")
## Degrees of Freedom (Industry): 15
cat("Degrees of Freedom (Residuals):", degrees_freedom_residual, "\n")
## Degrees of Freedom (Residuals): 2302342
cat("Sum of Squares (Industry):", round(sum_sq_industry, 2), "\n")
## Sum of Squares (Industry): 34824.46
cat("Sum of Squares (Residuals):", round(sum_sq_residual, 2), "\n")
## Sum of Squares (Residuals): 2176257
cat("Mean Square (Industry):", round(mean_sq_industry, 2), "\n")
## Mean Square (Industry): 2321.63
cat("Mean Square (Residuals):", round(mean_sq_residual, 2), "\n")
## Mean Square (Residuals): 0.95
cat("F-Statistic:", round(f_value, 2), "\n")
## F-Statistic: 2456.14
cat("P-Value:", format.pval(p_value, digits = 4), "\n")
## P-Value: < 2.2e-16

Interpreting the results of this: from the sum of squares values, we see that the variability between and within industries is quite significant, meaning that while industry might explain some of the differences in employees’ happiness index, it’s not the full picture. However, looking at the extremely small p value, we can still conclude that industry is a strong and significant predictor of happiness index.

b. Imagine (regardless of the outcome of the previous step) that your results indicate a statistically significant difference. In class, we’ve discussed strategies for post-hoc analysis, including interpreting visualizations and conducting post-hoc testing. Suppose you simply ran Tukey’s test as your post-hoc analysis. How many comparisons would that statistical test be making? 

I honestly didn’t know what Tukey’s test was. From my understanding (from the internet), it makes pairwise comparisons between all possible pairs of industries’ HI to identify which specific industry HI means would differ significantly. Because it takes every pair, this means that this would run \(\binom n2\) times. Since we have 15 industries, this would be \(\binom {15} 2=105\) comparisons.

c. In a sentence or two, what might be a better analysis step?

Looking at our data, we aren’t given equal variances between groups - meaning that we might use the Games-Howell test instead. While it is more complex, it also accounts for the fact that the variance in HI might differ between industries, since all industries are not built equally.

9. Running a Linear Regression on scoreVote based on time zone

Preparing the merged timezone df by company ID

timezone_df <- scoreVotesDF %>%
    inner_join(companyMetadataDF, by = "companyId")


# Print the first few rows of the joined data frame
unique(timezone_df$timezone)
##  [1] "Europe/Madrid"                  "America/Guayaquil"             
##  [3] "America/Mexico_City"            "America/Sao_Paulo"             
##  [5] "GMT"                            "Europe/Berlin"                 
##  [7] "Pacific/Galapagos"              "Europe/London"                 
##  [9] "America/Guatemala"              "America/Bogota"                
## [11] "America/Santiago"               "Europe/Luxembourg"             
## [13] "America/Argentina/Buenos_Aires"
timezone_df$scoreVote <- scale(timezone_df$scoreVote, center = TRUE, scale = FALSE) #Scaling by timezone to eliminate intercept
lin_reg <- lm(scoreVote ~ timezone, data = timezone_df)
summary(lin_reg)
## 
## Call:
## lm(formula = scoreVote ~ timezone, data = timezone_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9658 -2.5307  0.4693  2.4693  4.8389 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  0.04488    0.06591   0.681 0.495929    
## timezoneAmerica/Bogota      -0.01668    0.08257  -0.202 0.839922    
## timezoneAmerica/Guatemala    0.67738    0.06958   9.736  < 2e-16 ***
## timezoneAmerica/Guayaquil    0.69128    0.06793  10.176  < 2e-16 ***
## timezoneAmerica/Mexico_City  0.25625    0.06692   3.829 0.000129 ***
## timezoneAmerica/Santiago     0.43746    0.08244   5.307 1.12e-07 ***
## timezoneAmerica/Sao_Paulo    0.13692    0.06846   2.000 0.045495 *  
## timezoneEurope/Berlin       -0.13062    0.06774  -1.928 0.053813 .  
## timezoneEurope/London       -1.11338    0.07171 -15.525  < 2e-16 ***
## timezoneEurope/Luxembourg    0.20144    0.10736   1.876 0.060616 .  
## timezoneEurope/Madrid       -0.17735    0.06609  -2.683 0.007292 ** 
## timezoneGMT                 -0.06100    0.06896  -0.885 0.376408    
## timezonePacific/Galapagos    0.12552    0.55004   0.228 0.819482    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.73 on 495911 degrees of freedom
## Multiple R-squared:  0.0122, Adjusted R-squared:  0.01217 
## F-statistic: 510.3 on 12 and 495911 DF,  p-value: < 2.2e-16

Now extracting the strongest effect by finding the largest magnitude coefficient:

coefficients_df <- summary(lin_reg)$coefficients
coefficients_df <- as.data.frame(coefficients_df)
strongest_effect <- coefficients_df[which.max(abs(coefficients_df$Estimate)), ]

strongest_effect
##                        Estimate Std. Error   t value     Pr(>|t|)
## timezoneEurope/London -1.113379 0.07171362 -15.52534 2.407474e-54
# Identify the most significant effect (smallest p-value)
most_significant_effect <- coefficients_df[which.min(coefficients_df$`Pr(>|t|)`), ]

most_significant_effect
##                        Estimate Std. Error   t value     Pr(>|t|)
## timezoneEurope/London -1.113379 0.07171362 -15.52534 2.407474e-54

It appears that London has the strongest and the most significant effect. This is a bit confusing at first, but then looking at the datapoints, we find:

timezone_counts <- timezone_df %>%
    group_by(timezone) %>%
    summarise(count = n()) %>%
    arrange(desc(count))

head(timezone_counts, 10)
## # A tibble: 10 × 2
##    timezone             count
##    <chr>                <int>
##  1 Europe/Madrid       308876
##  2 America/Mexico_City  55788
##  3 Europe/Berlin        30572
##  4 America/Guayaquil    27592
##  5 America/Sao_Paulo    21777
##  6 GMT                  18133
##  7 America/Guatemala    15016
##  8 Europe/London         9336
##  9 America/Santiago      3041
## 10 America/Bogota        3014

Effect size and statistical significance are not really related to one another, as regardless of large effect size, the data can still be statistically insignificant if the sample size is small. Conversely, a small effect size can be statistically significant if the sample size is large enough.

Part 2: Reflecting on Data Collection and Analysis

Q3) Do you think Happyforce reliably measures employee satisfaction?

Thesis: I don’t think the happiness index sufficiently captures employee satisfaction. In my opinion, a more important predictor of employee satisfaction is whether they would recommend working at the company to others.

Evidence 1) The data is a bit flawed in that there are not an equal amount of respondents per company, possibly skewing the mean HI score. Take the top two scores in the piece below, for example:

# Calculate the mean hiVote for each company
mean_hiVote_by_company_and_headcount <- company_hiVotes %>%
    group_by(companyId) %>%
    summarise(mean_hiVote = mean(hiVote, na.rm = TRUE),
              response_count = n())  %>%   
  arrange(desc(mean_hiVote))

head(mean_hiVote_by_company_and_headcount, 10)
## # A tibble: 10 × 3
##    companyId                mean_hiVote response_count
##    <chr>                          <dbl>          <int>
##  1 60a37307fe5648659c7abf25        3.67              3
##  2 5fa00a76f553b600042d6011        3.58             26
##  3 60113050f50ef100042b1494        3.57            185
##  4 5f60e1c51f5cb400047183e0        3.54           6138
##  5 6058cd9eadd6f80003319249        3.52            419
##  6 5bb3503c375232000426c410        3.50          31467
##  7 57908a2622881200033b34d7        3.49           7146
##  8 602e81c1d3548400031f48a6        3.42           4481
##  9 5faee32997df24000430d586        3.39           9217
## 10 6047ac8b55de6f000456ca5d        3.37             49

One might argue that 6138 employees testifying a mean high vote of 3.54 is more significant than 3 employees giving a mean high vote of 3.7.

Evidence 2) Simply taking the mean hiVote does not take into account the variance in responses.

mean_hiVote_by_company_with_variance <- company_hiVotes %>%
    group_by(companyId) %>%
    summarise(mean_hiVote = mean(hiVote, na.rm = TRUE),
               variance = var(hiVote, na.rm = TRUE))  %>%   
  arrange(desc(mean_hiVote))

head(mean_hiVote_by_company_with_variance, 10)
## # A tibble: 10 × 3
##    companyId                mean_hiVote variance
##    <chr>                          <dbl>    <dbl>
##  1 60a37307fe5648659c7abf25        3.67    0.333
##  2 5fa00a76f553b600042d6011        3.58    0.254
##  3 60113050f50ef100042b1494        3.57    0.311
##  4 5f60e1c51f5cb400047183e0        3.54    0.451
##  5 6058cd9eadd6f80003319249        3.52    0.475
##  6 5bb3503c375232000426c410        3.50    0.450
##  7 57908a2622881200033b34d7        3.49    0.509
##  8 602e81c1d3548400031f48a6        3.42    0.562
##  9 5faee32997df24000430d586        3.39    0.507
## 10 6047ac8b55de6f000456ca5d        3.37    0.862

Again, the company with the highest mean hiVote also has more variance than the second highest company. Even if the second highest company is a bit less happy (3.57 as opposed to 3.67), if the variance is lower that also means that employees are more uniformly this level of happy.

Evidence 3) Mean HiVote alone fails to capture happiness over time. While this is just a toy example, it might be useful to the company to view the happiness of their employees over a set amount of time, instead of overall.

# Identify employee IDs with more than >1000 responses to properly identify trends
multi_response_ids <- hiVotesDF %>%
  count(employeeId) %>%
  filter(n > 10000) %>%
  pull(employeeId)
# Filtering only for employees with sufficient datapoints
filtered_hiVotes <- hiVotesDF %>%
  mutate(date = as.Date(date)) %>%  # Date so we can space shit out later
  filter(employeeId %in% multi_response_ids) %>%
  arrange(desc(employeeId))

head(filtered_hiVotes, 10)
##                   companyId employeeId             departmentId       date
## 1  57c4aa7dbb8b5c000396fd3b         24 57df9888838c0d0003e2abae 2016-09-19
## 2  57c4aa7dbb8b5c000396fd3b         24 57df9888838c0d0003e2abae 2016-09-20
## 3  57c4aa7dbb8b5c000396fd3b         24 57df9888838c0d0003e2abae 2016-09-21
## 4  57c4aa7dbb8b5c000396fd3b         24 57df9888838c0d0003e2abae 2016-09-22
## 5  57c4aa7dbb8b5c000396fd3b         24 57df9888838c0d0003e2abae 2016-09-23
## 6  57c4aa7dbb8b5c000396fd3b         24 57df9888838c0d0003e2abae 2016-09-24
## 7  57c4aa7dbb8b5c000396fd3b         24 57df9888838c0d0003e2abae 2016-09-25
## 8  57c4aa7dbb8b5c000396fd3b         24 57df9888838c0d0003e2abae 2016-09-26
## 9  57c4aa7dbb8b5c000396fd3b         24 57df9888838c0d0003e2abae 2016-09-27
## 10 57c4aa7dbb8b5c000396fd3b         24 57df9888838c0d0003e2abae 2016-09-29
##    hiVote
## 1       4
## 2       4
## 3       4
## 4       4
## 5       4
## 6       4
## 7       4
## 8       4
## 9       4
## 10      4
# Group them by month's average
hiVote_by_month <- filtered_hiVotes %>%
  mutate(month = floor_date(date, unit = "month")) %>%  
  group_by(employeeId, month) %>%
  summarise(mean_hiVote = mean(hiVote, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'employeeId'. You can override using the
## `.groups` argument.
# Visualize trend using a line graph
ggplot(hiVote_by_month, aes(x = month, y = mean_hiVote, color = as.factor(employeeId), group = employeeId)) +
  geom_line() +
  geom_point() +  
  labs(
    title = "Trend of Mean hiVote Over Time for Select Employees",
    x = "Month",
    y = "Mean HI",
    color = "Employee ID"
  ) +
  scale_x_date(
    date_breaks = "1 year",  
    date_labels = "%b %Y"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Q4) Alternate Scenarios

a) Imagine you are a Nefarious company that wanted to prove its employees were the happiest. How could you do this?

company_data <- filtered_hiVotes %>%
  filter(companyId == "5bb3503c375232000426c410") 

mean_happiness_by_day <- company_data %>%
  group_by(date) %>%
  summarise(mean_happiness = mean(hiVote, na.rm = TRUE)) %>%
  ungroup()

head(mean_happiness_by_day, 10)
## # A tibble: 10 × 2
##    date       mean_happiness
##    <date>              <dbl>
##  1 2018-10-23           4   
##  2 2018-10-24           4   
##  3 2018-10-25           3.5 
##  4 2018-10-26           3   
##  5 2018-10-27           3   
##  6 2018-10-29           3   
##  7 2018-10-30           3.67
##  8 2018-11-01           2   
##  9 2018-11-05           3   
## 10 2018-11-06           4

Look! My employees are so happy (on specific dates!)

a) Imagine you are a Company rival. How would you prove that everyone hates their job?

In a similar way, we could focus on departments with particularly bad HI scores. Take the department that scores 1.00, for example. If everyone in this department we cherrypicked hates their job surely everyone at this company hates their jobs just as much.

department_happiness <- filtered_hiVotes %>%
  group_by(departmentId) %>%
  summarise(mean_happiness = mean(hiVote, na.rm = TRUE)) %>%
  ungroup()

head(department_happiness, 10)
## # A tibble: 10 × 2
##    departmentId             mean_happiness
##    <chr>                             <dbl>
##  1 56aec900f1ef260003e307d9           3.07
##  2 576457853d77930003a4db82           3.30
##  3 576910af887958000317c79b           3.69
##  4 5769115c887958000317c7a1           3.98
##  5 57691165887958000317c7a2           3.95
##  6 57c01ffc8e0e090003ab5958           1   
##  7 57c020018e0e090003ab5959           3.07
##  8 57df9888838c0d0003e2abae           3.72
##  9 58f61705ca49f6000474c54a           2.93
## 10 58ffc0376bf1b70004366aa0           2.23

Q5) Self-chosen question

What company is the most uniform, culturally? (You can try and prove this by determining company has the least variation across departments)

department_variance <- hiVotesDF %>%
  group_by(companyId, departmentId) %>%
  summarise(mean_happiness = mean(hiVote, na.rm = TRUE), .groups = "drop")

head(department_variance, 10)
## # A tibble: 10 × 3
##    companyId                departmentId             mean_happiness
##    <chr>                    <chr>                             <dbl>
##  1 56aec740f1ef260003e307d6 56aec8e1f1ef260003e307d8           3.33
##  2 56aec740f1ef260003e307d6 56aec900f1ef260003e307d9           3.16
##  3 56aec740f1ef260003e307d6 56cbdc1c761e8400033baed4           3.08
##  4 56aec740f1ef260003e307d6 56cbdc52761e8400033baed5           3.40
##  5 56aec740f1ef260003e307d6 576457853d77930003a4db82           3.23
##  6 56aec740f1ef260003e307d6 576457993d77930003a4db83           3.34
##  7 56aec740f1ef260003e307d6 576457ae3d77930003a4db84           2.91
##  8 56aec740f1ef260003e307d6 5a9406994763b40004906e0e           3.57
##  9 56fd2b64f41c670003f643c8 59358519497fd300040ea898           2.59
## 10 56fd2b64f41c670003f643c8 59358523497fd300040ea899           2.62
company_variation <- department_variance %>%
  group_by(companyId) %>%
  summarise(variance = var(mean_happiness, na.rm = TRUE), .groups = "drop") %>%
  arrange(variance)  

head(company_variation, 10)
## # A tibble: 10 × 2
##    companyId                variance
##    <chr>                       <dbl>
##  1 6101220e42e92f2213502851  0.00207
##  2 5aaaafad4ddefd000481094f  0.00278
##  3 60c7297b4bcbc001be879aad  0.00428
##  4 5e6776f6d72b3700047d261d  0.00605
##  5 5f9999e442e9f500044d7177  0.0146 
##  6 60770c99f17ad90003452a13  0.0148 
##  7 5a96be30aa68f4000423a2c5  0.0188 
##  8 60e577836e505a4c371d7bf3  0.0194 
##  9 5ad7d22a40bbc80004b73727  0.0201 
## 10 5bb3503c375232000426c410  0.0219