R Depth Assignment

Part 1: Computing Quantitative Measures

Question 1

# Load the vote tables and the score metadata from CSV files located in the
# current working directory.
hiVotes <- read.csv(file = "hiVotes.csv")
scoreVotes <- read.csv(file = "scoreVotes.csv")
scoreMetadata <- read.csv(file = "scoreMetadata.csv")

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Summarise one numeric column of `data` as a one-row table holding its mean
# and standard deviation, each rounded to two decimals. NA values are ignored.
#
# data:   a data frame.
# column: unquoted name of the numeric column to summarise.
# Returns a one-row data frame with columns `mean` and `sd`.
summarise_mean_sd <- function(data, column) {
  data %>%
    summarize(
      mean = round(mean({{ column }}, na.rm = TRUE), 2),
      sd = round(sd({{ column }}, na.rm = TRUE), 2)
    )
}

# Score votes joined to the metadata of the score/question they answer.
# Built once because both the Wellbeing and the stress summaries filter it.
scored_votes <- scoreVotes %>%
  inner_join(scoreMetadata, by = c("scoreId", "questionId"))

hiVote_stats <- summarise_mean_sd(hiVotes, hiVote)

print("hiVote Statistics")

## [1] "hiVote Statistics"

print(hiVote_stats)

##   mean   sd
## 1 2.92 0.98

scoreVote_stats <- summarise_mean_sd(scoreVotes, scoreVote)

print("scoreVote Statistics")

## [1] "scoreVote Statistics"

print(scoreVote_stats)

##   mean   sd
## 1 6.23 2.75

# Restrict to votes whose score is named "Wellbeing".
wellbeing_scores <- scored_votes %>%
  filter(name == "Wellbeing") %>%
  summarise_mean_sd(scoreVote)

print("Wellbeing Score Statistics")

## [1] "Wellbeing Score Statistics"

print(wellbeing_scores)

##   mean   sd
## 1  6.3 2.69

# The stress question is selected by its exact question text, so any change
# to the wording in scoreMetadata.csv will silently yield an empty summary.
stress_scores <- scored_votes %>%
  filter(question == "On a scale from 1 to 10, how would you rate the work-related stress?") %>%
  summarise_mean_sd(scoreVote)

print("Work-related Stress Score Statistics")

## [1] "Work-related Stress Score Statistics"

print(stress_scores)

##   mean   sd
## 1 7.56 1.96

Question 2

library(ggplot2)

companyMetadata <- read.csv(file = "companyMetadata.csv")

# Ten industries with the most companies; rows whose industry is missing or
# blank are excluded before counting.
top_industries <- companyMetadata %>%
  filter(!(is.na(industry) | industry == "")) %>%
  group_by(industry) %>%
  summarise(company_count = n(), .groups = "drop") %>%
  arrange(desc(company_count)) %>%
  slice_head(n = 10)

# Horizontal bar chart, bars ordered by company count.
ggplot(
  top_industries,
  aes(x = reorder(industry, company_count), y = company_count)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  ggtitle("Top 10 Industries by Number of Companies") +
  xlab("Industry") +
  ylab("Number of Companies")

Question 3

# Every score vote together with the metadata of the score it belongs to.
score_data <- inner_join(
  scoreVotes,
  scoreMetadata,
  by = c("scoreId", "questionId")
)

# One boxplot per score name; the x tick labels are hidden and the colour
# legend identifies each category instead.
ggplot(score_data) +
  geom_boxplot(
    aes(x = name, y = scoreVote, color = name),
    outlier.shape = 16
  ) +
  ggtitle("Distribution of Score Votes by Score Category") +
  xlab("Score Category") +
  ylab("Score Vote") +
  theme(axis.text.x = element_blank())

Question 4

# Individual votes for the score named "Wellbeing".
wellbeing_scores <- inner_join(
  scoreVotes,
  scoreMetadata,
  by = c("scoreId", "questionId")
) %>%
  filter(name == "Wellbeing")

# Mean Wellbeing vote per company, highest first. The join keeps only votes
# whose companyId also appears in companyMetadata.
company_wellbeing_scores <- wellbeing_scores %>%
  inner_join(companyMetadata, by = "companyId") %>%
  group_by(companyId) %>%
  summarise(
    average_wellbeing_score = mean(scoreVote, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(average_wellbeing_score))

top_company <- slice_head(company_wellbeing_scores, n = 1)
print(top_company)

## # A tibble: 1 × 2
##   companyId                average_wellbeing_score
##   <chr>                                      <dbl>
## 1 60a37307fe5648659c7abf25                       8

Question 5

# hiVotes from companies in the two industries being compared.
industry_hiVotes <- hiVotes %>%
  inner_join(companyMetadata, by = "companyId") %>%
  filter(
    industry %in% c(
      "ARTS_ENTERTAINMENT_RECREATION",
      "FINANCIAL_SERVICES_INSURANCE"
    )
  )

# Mean hiVote per industry, rounded to two decimals.
mean_hiVotes <- industry_hiVotes %>%
  group_by(industry) %>%
  summarise(
    mean_hiVote = round(mean(hiVote, na.rm = TRUE), 2),
    .groups = "drop"
  )

print("Mean hiVote for each industry:")

## [1] "Mean hiVote for each industry:"

print(mean_hiVotes)

## # A tibble: 2 × 2
##   industry                      mean_hiVote
##   <chr>                               <dbl>
## 1 ARTS_ENTERTAINMENT_RECREATION        3.37
## 2 FINANCIAL_SERVICES_INSURANCE         3.04

# hiVote vectors for each of the two industries.
arts_entertainment_recreation <- pull(
  filter(industry_hiVotes, industry == "ARTS_ENTERTAINMENT_RECREATION"),
  hiVote
)

financial_services_insurance <- pull(
  filter(industry_hiVotes, industry == "FINANCIAL_SERVICES_INSURANCE"),
  hiVote
)

# Two-sample t-test with pooled variance (var.equal = TRUE).
t_test_result <- t.test(
  x = arts_entertainment_recreation,
  y = financial_services_insurance,
  var.equal = TRUE
)
print(t_test_result)

## 
##  Two Sample t-test
## 
## data:  arts_entertainment_recreation and financial_services_insurance
## t = 2.3825, df = 793616, p-value = 0.0172
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.05747963 0.59073657
## sample estimates:
## mean of x mean of y 
##  3.367347  3.043239

Question 6

# Mean scoreVote per company, highest first.
company_happiness <- scoreVotes %>%
  group_by(companyId) %>%
  summarise(
    average_scoreVote = mean(scoreVote, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(average_scoreVote))

# First row after sorting, i.e. the company with the highest mean.
top_company <- slice_head(company_happiness, n = 1)

print("Top company")

## [1] "Top company"

print(top_company)

## # A tibble: 1 × 2
##   companyId                average_scoreVote
##   <chr>                                <dbl>
## 1 5c5031fd2e9d970004262a20              8.75

Question 7

# A scoreVote at or above this value counts as "highly satisfied".
happiness_threshold <- 8

# Per company: number of valid votes, number of votes at or above the
# threshold, and the resulting percentage, sorted highest percentage first.
#
# `employee_count` is a count of votes (rows), not of distinct employees.
# It counts only non-missing votes so that the denominator matches the
# numerator, which already drops NA via na.rm = TRUE; using n() here would
# bias the percentage downwards for companies with missing votes.
company_happiness <- scoreVotes %>%
  inner_join(companyMetadata, by = "companyId") %>%
  group_by(companyId) %>%
  summarize(
    employee_count = sum(!is.na(scoreVote)),
    highly_satisfied = sum(scoreVote >= happiness_threshold, na.rm = TRUE)
  ) %>%
  mutate(
    percentage_highly_satisfied = (highly_satisfied / employee_count) * 100
  ) %>%
  arrange(desc(percentage_highly_satisfied))

top_company <- company_happiness %>%
  slice_head(n = 1)

print("Top company")

## [1] "Top company"

print(top_company)

## # A tibble: 1 × 4
##   companyId               employee_count highly_satisfied percentage_highly_sa…¹
##   <chr>                            <int>            <int>                  <dbl>
## 1 5c73c78750b72e0004cab5…            575              453                   78.8
## # ℹ abbreviated name: ¹percentage_highly_satisfied

# Look up the row for one specific company, selected by its hard-coded id,
# in the percentage table.
previous_top_company <- filter(
  company_happiness,
  companyId == "5c5031fd2e9d970004262a20"
)

print("Previous top company")

## [1] "Previous top company"

print(previous_top_company)

## # A tibble: 1 × 4
##   companyId               employee_count highly_satisfied percentage_highly_sa…¹
##   <chr>                            <int>            <int>                  <dbl>
## 1 5c5031fd2e9d970004262a…              4                3                     75
## # ℹ abbreviated name: ¹percentage_highly_satisfied

Question 8

# hiVote paired with the industry of the company it was cast in.
hiVotes_industry <- inner_join(hiVotes, companyMetadata, by = "companyId") %>%
  select(hiVote, industry)

# One-way ANOVA of hiVote across industries.
anova_summary <- summary(
  aov(formula = hiVote ~ industry, data = hiVotes_industry)
)
print(anova_summary)

##                  Df  Sum Sq Mean Sq F value Pr(>F)    
## industry         15   34824  2321.6    2456 <2e-16 ***
## Residuals   2302342 2176257     0.9                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Question 9

# scoreVote paired with the timezone of the company it was cast in.
score_timeZone <- inner_join(scoreVotes, companyMetadata, by = "companyId") %>%
  select(scoreVote, timezone)

# Linear model of scoreVote on timezone; the summary is auto-printed.
summary(lm(formula = scoreVote ~ timezone, data = score_timeZone))

## 
## Call:
## lm(formula = scoreVote ~ timezone, data = score_timeZone)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9658 -2.5307  0.4693  2.4693  4.8389 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  6.27448    0.06591  95.195  < 2e-16 ***
## timezoneAmerica/Bogota      -0.01668    0.08257  -0.202 0.839922    
## timezoneAmerica/Guatemala    0.67738    0.06958   9.736  < 2e-16 ***
## timezoneAmerica/Guayaquil    0.69128    0.06793  10.176  < 2e-16 ***
## timezoneAmerica/Mexico_City  0.25625    0.06692   3.829 0.000129 ***
## timezoneAmerica/Santiago     0.43746    0.08244   5.307 1.12e-07 ***
## timezoneAmerica/Sao_Paulo    0.13692    0.06846   2.000 0.045495 *  
## timezoneEurope/Berlin       -0.13062    0.06774  -1.928 0.053813 .  
## timezoneEurope/London       -1.11338    0.07171 -15.525  < 2e-16 ***
## timezoneEurope/Luxembourg    0.20144    0.10736   1.876 0.060616 .  
## timezoneEurope/Madrid       -0.17735    0.06609  -2.683 0.007292 ** 
## timezoneGMT                 -0.06100    0.06896  -0.885 0.376408    
## timezonePacific/Galapagos    0.12552    0.55004   0.228 0.819482    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.73 on 495911 degrees of freedom
## Multiple R-squared:  0.0122, Adjusted R-squared:  0.01217 
## F-statistic: 510.3 on 12 and 495911 DF,  p-value: < 2.2e-16

Part 2: Reflecting on Data Collection and Analysis

Question 4a

# hiVotes cast at the single company examined in this question.
hiVotes_company <- filter(hiVotes, companyId == "5f4798584672f900046c1329")

# Histogram with one bin per unit of hiVote; the y axis is capped at the
# count of the most frequent hiVote value.
ggplot(hiVotes_company, aes(x = hiVote)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  scale_y_continuous(limits = c(0, max(table(hiVotes_company$hiVote)))) +
  ggtitle("Employee Satisfaction Distribution for 5f4798584672f900046c1329") +
  xlab("hiVote") +
  ylab("Frequency")

mean_hiVote <- round(mean(hiVotes_company$hiVote, na.rm = TRUE), digits = 2)
print(paste("Mean hiVote:", mean_hiVote))

## [1] "Mean hiVote: 2.54"

# Statistical mode: the most frequently occurring hiVote value(s).
# base::mode() is NOT this -- it returns the storage mode of the vector
# (e.g. "numeric"), so the counts are tabulated explicitly instead.
# table() ignores NA; ties are all reported; an empty input yields NA.
hiVote_counts <- table(hiVotes_company$hiVote)
mode_hiVote <- if (length(hiVote_counts) > 0) {
  as.numeric(names(hiVote_counts)[hiVote_counts == max(hiVote_counts)])
} else {
  NA_real_
}
print(paste("Mode hiVote:", paste(mode_hiVote, collapse = ", ")))

## [1] "Mode hiVote: numeric"

Question 4b

# hiVotes cast at the single company examined in this question.
hiVotes_company <- filter(hiVotes, companyId == "5a5c65acda7ea50004af7996")

# Histogram with one bin per unit of hiVote; the y axis is capped at the
# count of the most frequent hiVote value.
ggplot(hiVotes_company, aes(x = hiVote)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  scale_y_continuous(limits = c(0, max(table(hiVotes_company$hiVote)))) +
  ggtitle("Employee Satisfaction Distribution for 5a5c65acda7ea50004af7996") +
  xlab("hiVote") +
  ylab("Frequency")

# Number of votes equal to 1 or 2. nrow() yields a plain integer, so paste()
# receives a scalar rather than a one-cell data frame from count().
low_hiVotes_count <- hiVotes_company %>%
  filter(hiVote %in% c(1, 2)) %>%
  nrow()
print(paste("Number of 1 or 2 hiVote scores:", low_hiVotes_count))

## [1] "Number of 1 or 2 hiVote scores: 9842"

# Number of votes equal to 3 or 4.
high_hiVotes_count <- hiVotes_company %>%
  filter(hiVote %in% c(3, 4)) %>%
  nrow()
print(paste("Number of 3 or 4 hiVote scores:", high_hiVotes_count))

## [1] "Number of 3 or 4 hiVote scores: 37003"

mean_hiVote <- round(mean(hiVotes_company$hiVote, na.rm = TRUE), 2)
print(paste("Mean hiVote:", mean_hiVote))

## [1] "Mean hiVote: 3.11"

Question 5

# Mean scoreVote per department, highest first.
department_happiness <- scoreVotes %>%
  group_by(departmentId) %>%
  summarise(
    average_scoreVote = mean(scoreVote, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(average_scoreVote))

# The five departments with the highest mean.
top_departments <- slice_head(department_happiness, n = 5)

# Horizontal bar chart, bars ordered by mean scoreVote.
ggplot(
  top_departments,
  aes(x = reorder(departmentId, average_scoreVote), y = average_scoreVote)
) +
  geom_col(fill = "blue", color = "black") +
  coord_flip() +
  ggtitle("Top 5 Departments with the Highest Average scoreVote") +
  xlab("Department") +
  ylab("Average scoreVote")

R Depth Assignment

Phillip Gao

2024-11-06

Part 1: Computing Quantitative Measures

Question 1

Question 2

Question 3

Question 4

Question 5

Question 6

Question 7

Question 8

Question 9

Part 2: Reflecting on Data Collection and Analysis

Question 4a

Question 4b

Question 5