Part 1: Computing Quantitative Measures
Question 1
# Read the vote tables and the score metadata from CSV files in the
# working directory.
hiVotes <- read.csv(file = "hiVotes.csv")
scoreVotes <- read.csv(file = "scoreVotes.csv")
scoreMetadata <- read.csv(file = "scoreMetadata.csv")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Overall mean and standard deviation of hiVote. Missing votes are ignored
# and both figures are rounded to two decimals.
hiVote_stats <- summarise(
  hiVotes,
  mean = round(mean(hiVote, na.rm = TRUE), digits = 2),
  sd = round(sd(hiVote, na.rm = TRUE), digits = 2)
)
print("hiVote Statistics")
## [1] "hiVote Statistics"
print(hiVote_stats)
## mean sd
## 1 2.92 0.98
# Same summary as for hiVote, applied to scoreVote: NA-safe mean and
# standard deviation, each rounded to two decimals.
scoreVote_stats <- scoreVotes %>%
  summarise(
    mean = round(mean(scoreVote, na.rm = TRUE), digits = 2),
    sd = round(sd(scoreVote, na.rm = TRUE), digits = 2)
  )
print("scoreVote Statistics")
## [1] "scoreVote Statistics"
print(scoreVote_stats)
## mean sd
## 1 6.23 2.75
# Attach score metadata to every vote (matching on scoreId and questionId),
# keep only rows whose score name is "Wellbeing", then summarise those votes
# with an NA-safe mean and standard deviation rounded to two decimals.
wellbeing_scores <- scoreVotes %>%
  inner_join(scoreMetadata, by = c("scoreId", "questionId")) %>%
  filter(name == "Wellbeing") %>%
  summarise(
    mean = round(mean(scoreVote, na.rm = TRUE), digits = 2),
    sd = round(sd(scoreVote, na.rm = TRUE), digits = 2)
  )
print("Wellbeing Score Statistics")
## [1] "Wellbeing Score Statistics"
print(wellbeing_scores)
## mean sd
## 1 6.3 2.69
# Votes for the work-related stress question. The question is selected by
# its exact text, so the string below must match the metadata verbatim.
# The summary is the NA-safe mean and standard deviation, two decimals each.
stress_scores <- scoreVotes %>%
  inner_join(scoreMetadata, by = c("scoreId", "questionId")) %>%
  filter(
    question == "On a scale from 1 to 10, how would you rate the work-related stress?"
  ) %>%
  summarise(
    mean = round(mean(scoreVote, na.rm = TRUE), digits = 2),
    sd = round(sd(scoreVote, na.rm = TRUE), digits = 2)
  )
print("Work-related Stress Score Statistics")
## [1] "Work-related Stress Score Statistics"
print(stress_scores)
## mean sd
## 1 7.56 1.96
Question 2
library(ggplot2)
# Company metadata, read from the working directory.
companyMetadata <- read.csv(file = "companyMetadata.csv")

# Count metadata rows per industry, skipping rows whose industry is NA or an
# empty string, and keep the ten industries with the most rows.
top_industries <- companyMetadata %>%
  filter(!(is.na(industry) | industry == "")) %>%
  group_by(industry) %>%
  summarise(company_count = n()) %>%
  arrange(desc(company_count)) %>%
  head(n = 10)

# Horizontal bar chart; reorder() sorts the bars by their count.
ggplot(top_industries, aes(x = reorder(industry, company_count), y = company_count)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  ggtitle("Top 10 Industries by Number of Companies") +
  xlab("Industry") +
  ylab("Number of Companies")

Question 3
# Pair every score vote with its metadata row so votes can be grouped by the
# score's name.
score_data <- inner_join(
  scoreVotes,
  scoreMetadata,
  by = c("scoreId", "questionId")
)

# One box per score name, colored by name. The x-axis tick labels are
# blanked out, leaving the color mapping to identify each category.
ggplot(score_data, aes(x = name, y = scoreVote, color = name)) +
  geom_boxplot(outlier.shape = 16) +
  ggtitle("Distribution of Score Votes by Score Category") +
  xlab("Score Category") +
  ylab("Score Vote") +
  theme(axis.text.x = element_blank())

Question 4
# Row-level "Wellbeing" votes. This overwrites the summary table of the same
# name built in Question 1.
wellbeing_scores <- scoreVotes %>%
  inner_join(scoreMetadata, by = c("scoreId", "questionId")) %>%
  filter(name == "Wellbeing")

# Average Wellbeing vote per company, restricted to companies present in the
# company metadata, ordered from highest to lowest average.
company_wellbeing_scores <- wellbeing_scores %>%
  inner_join(companyMetadata, by = "companyId") %>%
  group_by(companyId) %>%
  summarise(average_wellbeing_score = mean(scoreVote, na.rm = TRUE)) %>%
  arrange(desc(average_wellbeing_score))

# First row after sorting = company with the highest average.
top_company <- head(company_wellbeing_scores, n = 1)
print(top_company)
## # A tibble: 1 × 2
## companyId average_wellbeing_score
## <chr> <dbl>
## 1 60a37307fe5648659c7abf25 8
Question 5
# hiVotes joined to company metadata, restricted to the two industries being
# compared.
industry_hiVotes <- hiVotes %>%
  inner_join(companyMetadata, by = "companyId") %>%
  filter(
    industry %in% c("ARTS_ENTERTAINMENT_RECREATION", "FINANCIAL_SERVICES_INSURANCE")
  )

# NA-safe mean hiVote per industry, rounded to two decimals.
mean_hiVotes <- industry_hiVotes %>%
  group_by(industry) %>%
  summarise(mean_hiVote = round(mean(hiVote, na.rm = TRUE), digits = 2))
print("Mean hiVote for each industry:")
## [1] "Mean hiVote for each industry:"
print(mean_hiVotes)
## # A tibble: 2 × 2
## industry mean_hiVote
## <chr> <dbl>
## 1 ARTS_ENTERTAINMENT_RECREATION 3.37
## 2 FINANCIAL_SERVICES_INSURANCE 3.04
# Extract the hiVote vector for each of the two industries.
arts_entertainment_recreation <- subset(
  industry_hiVotes,
  industry == "ARTS_ENTERTAINMENT_RECREATION"
)[["hiVote"]]
financial_services_insurance <- subset(
  industry_hiVotes,
  industry == "FINANCIAL_SERVICES_INSURANCE"
)[["hiVote"]]

# Two-sample t-test on the two vote vectors. var.equal = TRUE requests the
# pooled-variance (Student) form rather than Welch's test.
t_test_result <- t.test(
  x = arts_entertainment_recreation,
  y = financial_services_insurance,
  var.equal = TRUE
)
print(t_test_result)
##
## Two Sample t-test
##
## data: arts_entertainment_recreation and financial_services_insurance
## t = 2.3825, df = 793616, p-value = 0.0172
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.05747963 0.59073657
## sample estimates:
## mean of x mean of y
## 3.367347 3.043239
Question 6
# Average scoreVote per company (missing votes ignored), ordered from the
# highest average to the lowest.
company_happiness <- scoreVotes %>%
  group_by(companyId) %>%
  summarise(average_scoreVote = mean(scoreVote, na.rm = TRUE)) %>%
  arrange(desc(average_scoreVote))

# First row after sorting = company with the highest average.
top_company <- head(company_happiness, n = 1)
print("Top company")
## [1] "Top company"
print(top_company)
## # A tibble: 1 × 2
## companyId average_scoreVote
## <chr> <dbl>
## 1 5c5031fd2e9d970004262a20 8.75
Question 7
# A vote counts as "highly satisfied" when it is at or above this value.
happiness_threshold <- 8

# Per company: number of vote rows, number of votes at or above the
# threshold, and the share of the latter as a percentage. Ordered by that
# percentage, highest first.
# NOTE(review): employee_count is n(), i.e. the number of vote rows per
# company (rows with a missing scoreVote included), not a count of distinct
# employees -- confirm this is the intended denominator.
company_happiness <- scoreVotes %>%
  inner_join(companyMetadata, by = "companyId") %>%
  group_by(companyId) %>%
  summarise(
    employee_count = n(),
    highly_satisfied = sum(scoreVote >= happiness_threshold, na.rm = TRUE)
  ) %>%
  mutate(
    percentage_highly_satisfied = 100 * (highly_satisfied / employee_count)
  ) %>%
  arrange(desc(percentage_highly_satisfied))

# First row after sorting = company with the highest percentage.
top_company <- head(company_happiness, n = 1)
print("Top company")
## [1] "Top company"
print(top_company)
## # A tibble: 1 × 4
## companyId employee_count highly_satisfied percentage_highly_sa…¹
## <chr> <int> <int> <dbl>
## 1 5c73c78750b72e0004cab5… 575 453 78.8
## # ℹ abbreviated name: ¹​percentage_highly_satisfied
# Look up the company that ranked first by average scoreVote in Question 6
# and show how it fares under the percentage-based measure.
previous_top_company <- filter(
  company_happiness,
  companyId == "5c5031fd2e9d970004262a20"
)
print("Previous top company")
## [1] "Previous top company"
print(previous_top_company)
## # A tibble: 1 × 4
## companyId employee_count highly_satisfied percentage_highly_sa…¹
## <chr> <int> <int> <dbl>
## 1 5c5031fd2e9d970004262a… 4 3 75
## # ℹ abbreviated name: ¹​percentage_highly_satisfied
Question 8
# hiVote paired with the industry of the company it belongs to; only these
# two columns are kept.
hiVotes_industry <- hiVotes %>%
  inner_join(companyMetadata, by = "companyId") %>%
  select("hiVote", "industry")

# One-way ANOVA of hiVote across industries.
anova_summary <- summary(
  aov(formula = hiVote ~ industry, data = hiVotes_industry)
)
print(anova_summary)
## Df Sum Sq Mean Sq F value Pr(>F)
## industry 15 34824 2321.6 2456 <2e-16 ***
## Residuals 2302342 2176257 0.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Question 9
# scoreVote paired with the timezone recorded for its company; only these
# two columns are kept.
score_timeZone <- scoreVotes %>%
  inner_join(companyMetadata, by = "companyId") %>%
  select("scoreVote", "timezone")

# Linear model of scoreVote on timezone; the summary is printed as the value
# of this top-level expression.
summary(
  lm(formula = scoreVote ~ timezone, data = score_timeZone)
)
##
## Call:
## lm(formula = scoreVote ~ timezone, data = score_timeZone)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9658 -2.5307 0.4693 2.4693 4.8389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.27448 0.06591 95.195 < 2e-16 ***
## timezoneAmerica/Bogota -0.01668 0.08257 -0.202 0.839922
## timezoneAmerica/Guatemala 0.67738 0.06958 9.736 < 2e-16 ***
## timezoneAmerica/Guayaquil 0.69128 0.06793 10.176 < 2e-16 ***
## timezoneAmerica/Mexico_City 0.25625 0.06692 3.829 0.000129 ***
## timezoneAmerica/Santiago 0.43746 0.08244 5.307 1.12e-07 ***
## timezoneAmerica/Sao_Paulo 0.13692 0.06846 2.000 0.045495 *
## timezoneEurope/Berlin -0.13062 0.06774 -1.928 0.053813 .
## timezoneEurope/London -1.11338 0.07171 -15.525 < 2e-16 ***
## timezoneEurope/Luxembourg 0.20144 0.10736 1.876 0.060616 .
## timezoneEurope/Madrid -0.17735 0.06609 -2.683 0.007292 **
## timezoneGMT -0.06100 0.06896 -0.885 0.376408
## timezonePacific/Galapagos 0.12552 0.55004 0.228 0.819482
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.73 on 495911 degrees of freedom
## Multiple R-squared: 0.0122, Adjusted R-squared: 0.01217
## F-statistic: 510.3 on 12 and 495911 DF, p-value: < 2.2e-16
Part 2: Reflecting on Data Collection and Analysis
Question 4a
# hiVotes belonging to the single company examined in Question 4a.
hiVotes_company <- filter(hiVotes, companyId == "5f4798584672f900046c1329")

# Histogram with one unit-wide bin per vote value. The y-axis runs from 0 to
# the count of the most frequent vote value.
ggplot(hiVotes_company, aes(x = hiVote)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  scale_y_continuous(limits = c(0, max(table(hiVotes_company$hiVote)))) +
  ggtitle("Employee Satisfaction Distribution for 5f4798584672f900046c1329") +
  xlab("hiVote") +
  ylab("Frequency")

# Mean hiVote for the selected company, ignoring missing votes and rounded
# to two decimals.
mean_hiVote <- round(mean(hiVotes_company$hiVote, na.rm = TRUE), 2)
print(paste("Mean hiVote:", mean_hiVote))
## [1] "Mean hiVote: 2.54"

# Statistical mode (most frequent vote). base::mode() reports the storage
# mode of its argument ("numeric"), not the most common value, so the votes
# are tabulated instead. table() drops NA by default, and which.max() picks
# the first maximum, so ties resolve to the smallest vote value. An empty
# selection yields NA rather than an error.
hiVote_counts <- table(hiVotes_company$hiVote)
mode_hiVote <- if (length(hiVote_counts) > 0) {
  as.numeric(names(hiVote_counts)[which.max(hiVote_counts)])
} else {
  NA_real_
}
print(paste("Mode hiVote:", mode_hiVote))
# NOTE: re-knit to refresh the rendered output for the line above; the
# previous run printed "numeric" because it called base::mode().
Question 4b
# hiVotes belonging to the single company examined in Question 4b. This
# overwrites the selection made for Question 4a.
hiVotes_company <- filter(hiVotes, companyId == "5a5c65acda7ea50004af7996")

# Histogram with one unit-wide bin per vote value. The y-axis runs from 0 to
# the count of the most frequent vote value.
ggplot(hiVotes_company, aes(x = hiVote)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  scale_y_continuous(limits = c(0, max(table(hiVotes_company$hiVote)))) +
  ggtitle("Employee Satisfaction Distribution for 5a5c65acda7ea50004af7996") +
  xlab("hiVote") +
  ylab("Frequency")

# Number of votes equal to 1 or 2. count() returns a one-row table holding
# the row count, which paste() then renders as text.
low_hiVotes_count <- hiVotes_company %>%
  filter(hiVote %in% c(1, 2)) %>%
  count()
print(paste("Number of 1 or 2 hiVote scores:", low_hiVotes_count))
## [1] "Number of 1 or 2 hiVote scores: 9842"

# Number of votes equal to 3 or 4, counted the same way.
high_hiVotes_count <- hiVotes_company %>%
  filter(hiVote %in% c(3, 4)) %>%
  count()
print(paste("Number of 3 or 4 hiVote scores:", high_hiVotes_count))
## [1] "Number of 3 or 4 hiVote scores: 37003"

# Mean hiVote for this company, ignoring missing votes, two decimals.
mean_hiVote <- round(mean(hiVotes_company$hiVote, na.rm = TRUE), digits = 2)
print(paste("Mean hiVote:", mean_hiVote))
## [1] "Mean hiVote: 3.11"
Question 5
# Average scoreVote per departmentId (missing votes ignored), ordered from
# the highest average to the lowest.
department_happiness <- scoreVotes %>%
  group_by(departmentId) %>%
  summarise(average_scoreVote = mean(scoreVote, na.rm = TRUE)) %>%
  arrange(desc(average_scoreVote))

# The five departments with the highest averages.
top_departments <- head(department_happiness, n = 5)

# Horizontal bar chart; reorder() sorts the bars by their average.
ggplot(
  top_departments,
  aes(x = reorder(departmentId, average_scoreVote), y = average_scoreVote)
) +
  geom_col(fill = "blue", color = "black") +
  coord_flip() +
  ggtitle("Top 5 Departments with the Highest Average scoreVote") +
  xlab("Department") +
  ylab("Average scoreVote")
