This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

if (!require(dplyr)) {
  install.packages("dplyr")
}
Loading required package: dplyr

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
if (!require(ggplot)) {
  install.packages("ggplot2")
}
Loading required package: ggplot
Warning: there is no package called ‘ggplot’
WARNING: Rtools is required to build R packages but is not currently installed. Please download and install the appropriate version of Rtools before proceeding:

https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/nickc/AppData/Local/R/win-library/4.4’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.4/ggplot2_3.5.1.zip'
Content type 'application/zip' length 5022358 bytes (4.8 MB)
downloaded 4.8 MB
package ‘ggplot2’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\nickc\AppData\Local\Temp\Rtmp8MDxkM\downloaded_packages
hiVotes <- read.csv("./data/hiVotes.csv")
companyMetadata <- read.csv("./data/companyMetadata.csv")
employees <- read.csv("./data/employees.csv")
scoreMetadata <- read.csv("./data/scoreMetadata.csv")
scoreVotes <- read.csv("./data/scoreVotes.csv")
print(mean(hiVotes$hiVote, na.rm = TRUE))
[1] 2.915459
print(sd(hiVotes$hiVote, na.rm=TRUE))
[1] 0.9799774
print(mean(scoreVotes$scoreVote, na.rm = TRUE))
[1] 6.229596
print(sd(scoreVotes$scoreVote, na.rm=TRUE))
[1] 2.747134
head(scoreVotes)
head(scoreMetadata)
wellbeing_questions <- subset(scoreMetadata, grepl("Wellbeing", name, ignore.case = TRUE))

library(dplyr)
merged_wellbeing_data <- inner_join(scoreVotes, wellbeing_questions, by = "questionId")

print(mean(merged_wellbeing_data$scoreVote, na.rm = TRUE))
[1] 6.301063
print(sd(merged_wellbeing_data$scoreVote, na.rm = TRUE))
[1] 2.6897
scoreMetadata
work_related_stress_question <- subset(scoreMetadata, grepl("5dd6e4a49a5137000450ff1d", questionId, ignore.case = FALSE))

head(work_related_stress_question)
merged_stress_data <- inner_join(scoreVotes, work_related_stress_question, by = "questionId")

print(mean(merged_stress_data$scoreVote, na.rm = TRUE))
[1] 7.560678
print(sd(merged_stress_data$scoreVote, na.rm = TRUE))
[1] 1.959066
library(ggplot2)
Warning: package ‘ggplot2’ was built under R version 4.4.2
top10_industries <- companyMetadata %>%
  group_by(industry) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice_max(order_by = count, n = 10)

ggplot(data = top10_industries, aes(x = reorder(industry, count), y = count)) +
  geom_bar(stat = "identity", fill = "lightblue") +
  coord_flip() +
  labs(
    title = "Top 10 Industries by Number of Companies",
    x = "Industry",
    y = "Number of Companies"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10)
  )

merged_score_data <- inner_join(scoreVotes, scoreMetadata, by = "scoreId")
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
ggplot(data = merged_score_data, aes(x = name, y = scoreVote)) +
  geom_boxplot(fill = "lightblue", color = "black", outlier.color = "red") +
  labs(
    title = "Distribution of Score Votes by Score Category",
    x = "Score Categories (Name)",
    y = "Score Votes"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 12)
  )

wellbeing_metadata <- scoreMetadata %>%
  filter(name == "Wellbeing")

wellbeing_votes <- scoreVotes %>%
  inner_join(wellbeing_metadata, by = "scoreId")
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
wellbeing_with_company <- wellbeing_votes %>%
  inner_join(companyMetadata, by = "companyId")

average_scores <- wellbeing_with_company %>%
  group_by(companyId) %>%
  summarise(avg_wellbeing_score = mean(scoreVote, na.rm = TRUE)) %>%
  arrange(desc(avg_wellbeing_score))

top_company <- average_scores %>%
  slice_max(order_by = avg_wellbeing_score, n = 1)

top_company_details <- top_company %>%
  inner_join(companyMetadata, by = "companyId")

print(top_company_details)
target_companies <- companyMetadata %>%
  filter(industry %in% c("ARTS_ENTERTAINMENT_RECREATION", "FINANCIAL_SERVICES_INSURANCE"))

industry_votes <- scoreVotes %>%
  inner_join(target_companies, by = "companyId") %>%
  select(industry, scoreVote)

arts_recreation <- industry_votes %>%
  filter(industry == "ARTS_ENTERTAINMENT_RECREATION") %>%
  pull(scoreVote)

financial_services <- industry_votes %>%
  filter(industry == "FINANCIAL_SERVICES_INSURANCE") %>%
  pull(scoreVote)

mean_arts <- mean(arts_recreation, na.rm = TRUE)
mean_financial <- mean(financial_services, na.rm = TRUE)

t_test_result <- t.test(arts_recreation, financial_services, var.equal = TRUE)

list(
  Mean_Arts_Entertainment_Recreation = mean_arts,
  Mean_Financial_Services_Insurance = mean_financial,
  P_Value = t_test_result$p.value,
  T_Test_Result = t_test_result
)
$Mean_Arts_Entertainment_Recreation
[1] 6.662791

$Mean_Financial_Services_Insurance
[1] 6.606734

$P_Value
[1] 0.8539026

$T_Test_Result

    Two Sample t-test

data:  arts_recreation and financial_services
t = 0.18414, df = 98839, p-value = 0.8539
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.5406072  0.6527210
sample estimates:
mean of x mean of y 
 6.662791  6.606734 
merged_happiness_data <- scoreVotes %>%
  inner_join(companyMetadata, by = "companyId") %>%
  select(companyId, companyName = industry, hiVote = scoreVote)

happiest_companies <- merged_happiness_data %>%
  group_by(companyId, companyName) %>%
  summarise(
    avg_happiness = mean(hiVote, na.rm = TRUE),
    sd_happiness = sd(hiVote, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(desc(avg_happiness))
`summarise()` has grouped output by 'companyId'. You can override using the `.groups` argument.
print(happiest_companies)
happiest_companies_composite <- merged_happiness_data %>%
  group_by(companyId, companyName) %>%
  summarise(
    avg_happiness = mean(hiVote, na.rm = TRUE),
    sd_happiness = sd(hiVote, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  mutate(
    composite_score = avg_happiness - sd_happiness + log(count)
  ) %>%
  arrange(desc(composite_score))

top_composite_company <- happiest_companies_composite %>%
  slice_max(order_by = composite_score, n = 1)

head(happiest_companies_composite)
merged_industry_data <- inner_join(hiVotes, companyMetadata, by = "companyId") %>%
  select(hiVote, industry)

anova_result <- aov(hiVote ~ industry, data = merged_industry_data)

anova_summary <- summary(anova_result)

anova_p_value <- anova_summary[[1]]$`Pr(>F)`[1]
print(paste("P-value for ANOVA:", anova_p_value))
[1] "P-value for ANOVA: 0"
num_industries <- length(unique(merged_industry_data$industry))

print((num_industries * (num_industries - 1)) / 2)
[1] 120
str(companyMetadata)
'data.frame':   147 obs. of  3 variables:
 $ companyId: chr  "57908a2622881200033b34d7" "57c4aa7dbb8b5c000396fd3b" "56fd2b64f41c670003f643c8" "57ac8b23be7fe30003e656d0" ...
 $ industry : chr  "COMPUTER_SOFTWARE_IT_SERVICES" "HEALTH_CARE_SOCIAL_ASSISTANCE" "MANUFACTURING" "COMPUTER_SOFTWARE_IT_SERVICES" ...
 $ timezone : chr  "Europe/Madrid" "America/Guayaquil" "Europe/Madrid" "Europe/Madrid" ...
merged_timezone_data <- scoreVotes %>%
  inner_join(companyMetadata, by = "companyId") %>%
  select(scoreVote, timezone)

lm_result <- lm(scoreVote ~ timezone, data = merged_timezone_data)

summary_lm <- summary(lm_result)

print(summary_lm)

Call:
lm(formula = scoreVote ~ timezone, data = merged_timezone_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.9658 -2.5307  0.4693  2.4693  4.8389 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                  6.27448    0.06591  95.195  < 2e-16 ***
timezoneAmerica/Bogota      -0.01668    0.08257  -0.202 0.839922    
timezoneAmerica/Guatemala    0.67738    0.06958   9.736  < 2e-16 ***
timezoneAmerica/Guayaquil    0.69128    0.06793  10.176  < 2e-16 ***
timezoneAmerica/Mexico_City  0.25625    0.06692   3.829 0.000129 ***
timezoneAmerica/Santiago     0.43746    0.08244   5.307 1.12e-07 ***
timezoneAmerica/Sao_Paulo    0.13692    0.06846   2.000 0.045495 *  
timezoneEurope/Berlin       -0.13062    0.06774  -1.928 0.053813 .  
timezoneEurope/London       -1.11338    0.07171 -15.525  < 2e-16 ***
timezoneEurope/Luxembourg    0.20144    0.10736   1.876 0.060616 .  
timezoneEurope/Madrid       -0.17735    0.06609  -2.683 0.007292 ** 
timezoneGMT                 -0.06100    0.06896  -0.885 0.376408    
timezonePacific/Galapagos    0.12552    0.55004   0.228 0.819482    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.73 on 495911 degrees of freedom
Multiple R-squared:  0.0122,    Adjusted R-squared:  0.01217 
F-statistic: 510.3 on 12 and 495911 DF,  p-value: < 2.2e-16
coeff_table <- as.data.frame(summary_lm$coefficients)

# Add row names (time zones) to the coefficients table
coeff_table$TimeZone <- rownames(coeff_table)

# Exclude the Intercept
coeff_table <- coeff_table[coeff_table$TimeZone != "(Intercept)", ]

# Identify the strongest effect (largest absolute coefficient)
strongest_effect <- coeff_table[which.max(abs(coeff_table$Estimate)), ]

# Identify the most significant effect (smallest p-value)
most_significant_effect <- coeff_table[which.min(coeff_table$`Pr(>|t|)`), ]

# Print results
print("Time zone with the strongest effect:")
[1] "Time zone with the strongest effect:"
print(strongest_effect)

print("Time zone with the most significant effect:")
[1] "Time zone with the most significant effect:"
print(most_significant_effect)
merged_department_data <- hiVotes %>%
  inner_join(companyMetadata, by = "companyId") %>%
  select(departmentId, hiVote)

dept_avg <- merged_department_data %>%
  group_by(departmentId) %>%
  summarise(avg_hiVote = mean(hiVote, na.rm = TRUE)) %>%
  arrange(desc(avg_hiVote))

overall_avg <- mean(merged_department_data$hiVote, na.rm = TRUE)

top_department <- dept_avg[1, ]

print(paste("Top department:", top_department$departmentId))
[1] "Top department: 595d0ee77b727a0004a19ad0"
print(paste("Average hiVote for top department:", round(top_department$avg_hiVote, 2)))
[1] "Average hiVote for top department: 4"
print(paste("Overall company average hiVote:", round(overall_avg, 2)))
[1] "Overall company average hiVote: 2.92"
---
title: "R Notebook"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*. 
```{r}
# Attach the packages this notebook depends on, installing any that are
# missing first.
# - `requireNamespace()` only checks availability, so a package is installed
#   once when it is absent instead of on every run.
# - `library()` then attaches it and fails loudly if that is still impossible,
#   rather than returning FALSE the way `require()` does.
# The plotting package is named "ggplot2"; there is no package called "ggplot".
for (pkg in c("dplyr", "ggplot2")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

```
```{r}
# Read the raw CSV exports from ./data into data frames. Nothing is
# transformed here; later chunks join several of these tables on id columns
# such as `companyId`, `questionId` and `scoreId`.
companyMetadata <- read.csv(file = "./data/companyMetadata.csv")
employees <- read.csv(file = "./data/employees.csv")
hiVotes <- read.csv(file = "./data/hiVotes.csv")
scoreMetadata <- read.csv(file = "./data/scoreMetadata.csv")
scoreVotes <- read.csv(file = "./data/scoreVotes.csv")

```


```{r}
# Mean of the `hiVote` column over all rows of `hiVotes`, ignoring NA values.
print(mean(hiVotes[["hiVote"]], na.rm = TRUE))
```


```{r}
# Standard deviation of the `hiVote` column, ignoring NA values.
print(sd(hiVotes[["hiVote"]], na.rm = TRUE))
```
```{r}
# Mean of the `scoreVote` column over all rows of `scoreVotes`, ignoring NA.
print(mean(scoreVotes[["scoreVote"]], na.rm = TRUE))
```
```{r}
# Standard deviation of the `scoreVote` column, ignoring NA values.
print(sd(scoreVotes[["scoreVote"]], na.rm = TRUE))

```
```{r}
# Preview the first six rows of the vote table.
head(scoreVotes, n = 6L)
```
```{r}
# Preview the first six rows of the score metadata table.
head(scoreMetadata, n = 6L)
```
```{r}
library(dplyr)

# Keep the metadata rows whose `name` mentions "wellbeing" in any casing.
wellbeing_questions <- scoreMetadata[
  grepl("Wellbeing", scoreMetadata[["name"]], ignore.case = TRUE),
  ,
  drop = FALSE
]

# Attach those rows to the individual votes by question id; votes on
# questions outside the selection drop out of the inner join.
merged_wellbeing_data <- scoreVotes %>%
  inner_join(wellbeing_questions, by = "questionId")

print(mean(merged_wellbeing_data[["scoreVote"]], na.rm = TRUE))

```
```{r}
# Spread of the votes kept in `merged_wellbeing_data`, ignoring NA values.
print(sd(merged_wellbeing_data[["scoreVote"]], na.rm = TRUE))

```
```{r}
# Display the whole score metadata table (presumably to look up the ids that
# later chunks filter on).
scoreMetadata
```


```{r}
# Pick out the metadata row(s) for one specific question by its id.
# NOTE(review): the variable name suggests this id is the work-related stress
# question - confirm against the `scoreMetadata` listing.
# The id is compared with `==` rather than `grepl()`: an id is an exact key,
# not a regular expression, so substring/regex matching is not wanted here.
stress_question_id <- "5dd6e4a49a5137000450ff1d"
work_related_stress_question <- subset(
  scoreMetadata,
  questionId == stress_question_id
)

head(work_related_stress_question)
```
```{r}
# Restrict the votes to the question(s) held in `work_related_stress_question`
# and report their mean score.
merged_stress_data <- scoreVotes %>%
  inner_join(work_related_stress_question, by = "questionId")

print(mean(merged_stress_data[["scoreVote"]], na.rm = TRUE))
```
```{r}
# Spread of the votes kept in `merged_stress_data`, ignoring NA values.
print(sd(merged_stress_data[["scoreVote"]], na.rm = TRUE))

```

```{r}
library(ggplot2)

# Count companies per industry and keep the ten largest counts.
# `slice_max()` keeps ties by default, so more than ten rows are possible.
top10_industries <- companyMetadata %>%
  group_by(industry) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  slice_max(order_by = count, n = 10)

# Horizontal bar chart with one bar per industry, ordered by company count.
ggplot(top10_industries, aes(x = reorder(industry, count), y = count)) +
  geom_col(fill = "lightblue") +
  coord_flip() +
  ggtitle("Top 10 Industries by Number of Companies") +
  xlab("Industry") +
  ylab("Number of Companies") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5)
  )
```
```{r}
# Attach each vote to the metadata row for its own score and question.
# Joining on `scoreId` alone matched every vote against all metadata rows that
# share the score (dplyr warned about a many-to-many relationship), which
# multiplies the rows without adding information. Both tables carry
# `questionId` as well (other chunks join on it), so it is used as a second
# key to pair a vote only with the metadata row for the same question.
merged_score_data <- inner_join(
  scoreVotes,
  scoreMetadata,
  by = c("scoreId", "questionId")
)

# One box per score category (`name`), showing the distribution of votes.
ggplot(data = merged_score_data, aes(x = name, y = scoreVote)) +
  geom_boxplot(fill = "lightblue", color = "black", outlier.color = "red") +
  labs(
    title = "Distribution of Score Votes by Score Category",
    x = "Score Categories (Name)",
    y = "Score Votes"
  ) +
  theme_minimal() +
  theme(
    # Category names are long, so tilt them to avoid overlapping labels.
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 12)
  )

```
```{r}
# Find the company with the highest average vote on the "Wellbeing" score.

# Metadata rows whose score name is exactly "Wellbeing".
wellbeing_metadata <- scoreMetadata %>%
  filter(name == "Wellbeing")

# Keep only the votes on those questions. Matching on `questionId` as well as
# `scoreId` avoids the many-to-many fan-out that joining on `scoreId` alone
# produced (dplyr warned about it), so a vote is paired only with the metadata
# row for its own question.
wellbeing_votes <- scoreVotes %>%
  inner_join(wellbeing_metadata, by = c("scoreId", "questionId"))

# The inner join also drops votes from companies missing in `companyMetadata`.
wellbeing_with_company <- wellbeing_votes %>%
  inner_join(companyMetadata, by = "companyId")

# Average wellbeing vote per company, best first.
average_scores <- wellbeing_with_company %>%
  group_by(companyId) %>%
  summarise(avg_wellbeing_score = mean(scoreVote, na.rm = TRUE)) %>%
  arrange(desc(avg_wellbeing_score))

# `slice_max()` keeps ties, so several companies can be returned.
top_company <- average_scores %>%
  slice_max(order_by = avg_wellbeing_score, n = 1)

# Add the company's metadata columns back for the report.
top_company_details <- top_company %>%
  inner_join(companyMetadata, by = "companyId")

print(top_company_details)
```
```{r}
# Compare score votes between two industries with a two-sample t-test.

# Companies belonging to either industry of interest.
target_companies <- companyMetadata %>%
  filter(
    industry %in% c("ARTS_ENTERTAINMENT_RECREATION", "FINANCIAL_SERVICES_INSURANCE")
  )

# Votes cast in those companies, reduced to the two columns the test needs.
industry_votes <- inner_join(scoreVotes, target_companies, by = "companyId") %>%
  select(industry, scoreVote)

# One vector of votes per industry.
arts_recreation <- with(
  industry_votes,
  scoreVote[industry %in% "ARTS_ENTERTAINMENT_RECREATION"]
)
financial_services <- with(
  industry_votes,
  scoreVote[industry %in% "FINANCIAL_SERVICES_INSURANCE"]
)

mean_arts <- mean(arts_recreation, na.rm = TRUE)
mean_financial <- mean(financial_services, na.rm = TRUE)

# `var.equal = TRUE` requests the pooled-variance test instead of R's default
# Welch test.
# NOTE(review): confirm that pooling the variances is intended.
t_test_result <- t.test(arts_recreation, financial_services, var.equal = TRUE)

# Displayed as the chunk's result: both means, the p-value and the full test.
list(
  Mean_Arts_Entertainment_Recreation = mean_arts,
  Mean_Financial_Services_Insurance = mean_financial,
  P_Value = t_test_result$p.value,
  T_Test_Result = t_test_result
)
```
```{r}
# Per-company mean, standard deviation and number of score votes.
# NOTE(review): the column aliases are kept because the following chunk reads
# them, but they are misleading: `companyName` holds the company's `industry`,
# and `hiVote` holds `scoreVote` from `scoreVotes`, not votes from `hiVotes`.
merged_happiness_data <- scoreVotes %>%
  inner_join(companyMetadata, by = "companyId") %>%
  select(companyId, companyName = industry, hiVote = scoreVote)

happiest_companies <- merged_happiness_data %>%
  group_by(companyId, companyName) %>%
  summarise(
    avg_happiness = mean(hiVote, na.rm = TRUE),
    sd_happiness = sd(hiVote, na.rm = TRUE),
    count = n(),
    # Return an ungrouped table, matching the composite-score chunk; without
    # this the result stays grouped by `companyId` and dplyr prints a message.
    .groups = "drop"
  ) %>%
  arrange(desc(avg_happiness))

print(happiest_companies)
```
```{r}
# Rank companies by a composite score that rewards a high mean vote and many
# votes, and penalises a large spread:
#   composite = mean - sd + log(number of votes)
happiest_companies_composite <-
  summarise(
    group_by(merged_happiness_data, companyId, companyName),
    avg_happiness = mean(hiVote, na.rm = TRUE),
    sd_happiness = sd(hiVote, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  mutate(composite_score = avg_happiness - sd_happiness + log(count)) %>%
  arrange(desc(composite_score))

# Best-scoring company (ties are kept by `slice_max()`).
top_composite_company <- slice_max(
  happiest_companies_composite,
  order_by = composite_score,
  n = 1
)

head(happiest_companies_composite, n = 6L)
```
```{r}
# One-way ANOVA: does the mean `hiVote` differ between industries?

# Tag every vote with the industry of the company it came from.
merged_industry_data <- hiVotes %>%
  inner_join(companyMetadata, by = "companyId") %>%
  select(hiVote, industry)

anova_result <- aov(hiVote ~ industry, data = merged_industry_data)

anova_summary <- summary(anova_result)

# The first element of the summary is the ANOVA table; the first entry of its
# `Pr(>F)` column is the p-value for the `industry` term.
anova_p_value <- anova_summary[[1]][["Pr(>F)"]][1]

# `format.pval()` reports a value below machine precision as "< 2.22e-16"
# instead of the misleading literal "0" that pasting the raw number gives.
print(paste("P-value for ANOVA:", format.pval(anova_p_value)))
```

```{r}
# Number of distinct industries among the votes, and how many unordered
# pairs of industries that gives (n choose 2).
num_industries <- length(unique(merged_industry_data[["industry"]]))

print(choose(num_industries, 2))
```
```{r}
# Show the column names and types of the company metadata table.
str(object = companyMetadata)
```



```{r}
# Regress the score vote on the company's time zone.

# Tag every vote with the time zone of the company it came from.
merged_timezone_data <- inner_join(
  scoreVotes,
  companyMetadata,
  by = "companyId"
) %>%
  select(scoreVote, timezone)

# `timezone` is a categorical predictor, so each coefficient is reported
# against a baseline time zone that is absorbed into the intercept.
lm_result <- lm(formula = scoreVote ~ timezone, data = merged_timezone_data)

summary_lm <- summary(lm_result)

print(summary_lm)
```

```{r}
# Pull the coefficient matrix out of the regression summary as a data frame.
coeff_table <- as.data.frame(coef(summary_lm))

# Keep the coefficient labels as a regular column.
coeff_table$TimeZone <- rownames(coeff_table)

# The intercept is not a time-zone effect, so leave it out of the comparison.
coeff_table <- subset(coeff_table, TimeZone != "(Intercept)")

# Strongest effect: the coefficient with the largest magnitude.
strongest_effect <- coeff_table[which.max(abs(coeff_table[["Estimate"]])), ]

# Most significant effect: the smallest p-value (first one if several tie).
most_significant_effect <- coeff_table[which.min(coeff_table[["Pr(>|t|)"]]), ]

# Report both rows.
print("Time zone with the strongest effect:")
print(strongest_effect)

print("Time zone with the most significant effect:")
print(most_significant_effect)
```
```{r}
# Find the department with the highest mean `hiVote` and compare it with the
# mean over all retained votes.

# The inner join keeps only votes from companies present in `companyMetadata`.
merged_department_data <- inner_join(
  hiVotes,
  companyMetadata,
  by = "companyId"
) %>%
  select(departmentId, hiVote)

# Mean vote per department, best first.
dept_avg <- merged_department_data %>%
  group_by(departmentId) %>%
  summarise(avg_hiVote = mean(hiVote, na.rm = TRUE)) %>%
  arrange(desc(avg_hiVote))

overall_avg <- mean(merged_department_data[["hiVote"]], na.rm = TRUE)

# First row after sorting = highest average (no minimum vote count applied).
top_department <- dept_avg[1, ]

print(paste("Top department:", top_department[["departmentId"]]))
print(paste("Average hiVote for top department:", round(top_department[["avg_hiVote"]], 2)))
print(paste("Overall company average hiVote:", round(overall_avg, 2)))
```


