# Load the required libraries
library(ggplot2)
library(agricolae)
library(cowplot)

# Create the data frame
my_data <- data.frame(
  ID = 1:10,
  Department = c("marketing", "marketing", "sales", "sales", "it", "it", "it", "hr", "hr", "marketing"),
  Gender = c("female", "male", "female", "male", "female", "male", "female", "male", "female", "female"),
  Salary = c(50000, 60000, 55000, 45000, 75000, 80000, 72000, 48000, 51000, 62000),
  Experience = c(2, 4, 3, 1, 5, 7, 4, 2, 3, 3)
)

# 1. Z-test
# Given data
national_avg_salary <- 55000
pop_sd <- 15000
alpha <- 0.05

# Calculate sample mean
sample_mean <- mean(my_data$Salary)

# Calculate sample size
n <- nrow(my_data)

# Calculate Z-score
Z <- (sample_mean - national_avg_salary) / (pop_sd / sqrt(n))

# Calculate p-value
p_value <- 2 * (1 - pnorm(abs(Z)))

# Print Z-score and p-value
cat("Z-score:", Z, "\n")
## Z-score: 1.011929
cat("p-value:", p_value, "\n")
## p-value: 0.3115721
# Check if null hypothesis is rejected
if (p_value < alpha) {
  cat("Reject null hypothesis: Average salary across all departments is significantly different from the national average salary.\n")
} else {
  cat("Fail to reject null hypothesis: Average salary across all departments is not significantly different from the national average salary.\n")
}
## Fail to reject null hypothesis: Average salary across all departments is not significantly different from the national average salary.
# 2. Two sample T-test 
# Given data
alpha <- 0.05

# Subset data for male and female employees
male_salaries <- my_data$Salary[my_data$Gender == "male"]
female_salaries <- my_data$Salary[my_data$Gender == "female"]

# Conduct two-sample t-test
t_test_result <- t.test(male_salaries, female_salaries, var.equal = TRUE)

# Print the test result
print(t_test_result)
## 
##  Two Sample t-test
## 
## data:  male_salaries and female_salaries
## t = -0.3102, df = 8, p-value = 0.7643
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -21787.42  16620.75
## sample estimates:
## mean of x mean of y 
##  58250.00  60833.33
# Interpret the results
if (t_test_result$p.value < alpha) {
  cat("\nReject the null hypothesis: There is a significant difference in the average salaries between male and female employees.\n")
} else {
  cat("\nFail to reject the null hypothesis: There is no significant difference in the average salaries between male and female employees.\n")
}
## 
## Fail to reject the null hypothesis: There is no significant difference in the average salaries between male and female employees.
# 3. ANOVA 
# Given data
alpha <- 0.05

# Perform one-way ANOVA
anova_result <- aov(Salary ~ Department, data = my_data)

# Summary of the ANOVA
anova_summary <- summary(anova_result)
print(anova_summary)
##             Df    Sum Sq   Mean Sq F value  Pr(>F)   
## Department   3 1.178e+09 392588889   13.87 0.00417 **
## Residuals    6 1.698e+08  28305556                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Extract p-value
p_value <- anova_summary[[1]]$`Pr(>F)`[1]

# Interpret the results
if (p_value < alpha) {
  cat("\nReject the null hypothesis: There are statistically significant differences in average salaries across the four departments.\n")
} else {
  cat("\nFail to reject the null hypothesis: There are no statistically significant differences in average salaries across the four departments.\n")
}
## 
## Reject the null hypothesis: There are statistically significant differences in average salaries across the four departments.
# 4. Post hoc test for one way ANOVA
# Given data
alpha <- 0.05

# Perform one-way ANOVA
anova_result <- aov(Salary ~ Department, data = my_data)

# Check if ANOVA result is significant
if (summary(anova_result)[[1]]$`Pr(>F)`[1] < alpha) {
  # Perform Tukey's HSD test
  tukey_result <- HSD.test(anova_result, "Department", console = TRUE)
  
  # Print the Tukey's HSD test result
  print(tukey_result)
} else {
  cat("The ANOVA did not show significant differences.")
}
## 
## Study: anova_result ~ "Department"
## 
## HSD Test for Salary 
## 
## Mean Square Error:  28305556 
## 
## Department,  means
## 
##             Salary      std r       se   Min   Max   Q25   Q50   Q75
## hr        49500.00 2121.320 2 3762.018 48000 51000 48750 49500 50250
## it        75666.67 4041.452 3 3071.675 72000 80000 73500 75000 77500
## marketing 57333.33 6429.101 3 3071.675 50000 62000 55000 60000 61000
## sales     50000.00 7071.068 2 3762.018 45000 55000 47500 50000 52500
## 
## Alpha: 0.05 ; DF Error: 6 
## Critical Value of Studentized Range: 4.895599 
## 
## Groups according to probability of means differences and alpha level( 0.05 )
## 
## Treatments with the same letter are not significantly different.
## 
##             Salary groups
## it        75666.67      a
## marketing 57333.33      b
## sales     50000.00      b
## hr        49500.00      b
## $statistics
##    MSerror Df  Mean       CV
##   28305556  6 59800 8.896817
## 
## $parameters
##    test     name.t ntr StudentizedRange alpha
##   Tukey Department   4         4.895599  0.05
## 
## $means
##             Salary      std r       se   Min   Max   Q25   Q50   Q75
## hr        49500.00 2121.320 2 3762.018 48000 51000 48750 49500 50250
## it        75666.67 4041.452 3 3071.675 72000 80000 73500 75000 77500
## marketing 57333.33 6429.101 3 3071.675 50000 62000 55000 60000 61000
## sales     50000.00 7071.068 2 3762.018 45000 55000 47500 50000 52500
## 
## $comparison
## NULL
## 
## $groups
##             Salary groups
## it        75666.67      a
## marketing 57333.33      b
## sales     50000.00      b
## hr        49500.00      b
## 
## attr(,"class")
## [1] "group"
# 5. Visualization
# Scatter plot: Salary vs Experience
scatter_plot <- ggplot(my_data, aes(x = Experience, y = Salary)) +
  geom_point() +
  labs(title = "Salary vs Experience",
       x = "Experience",
       y = "Salary")

# Histogram of Salaries
histogram <- ggplot(my_data, aes(x = Salary)) +
  geom_histogram(binwidth = 10000, fill = "blue", color = "black") +
  labs(title = "Histogram of Salaries",
       x = "Salary",
       y = "Frequency")

# Box plot: Salary distribution by Department and Gender
box_plot <- ggplot(my_data, aes(x = Department, y = Salary, fill = Gender)) +
  geom_boxplot() +
  labs(title = "Salary Distribution by Department and Gender",
       x = "Department",
       y = "Salary",
       fill = "Gender")

# Combine plots
plot_grid(scatter_plot, histogram, box_plot, ncol = 2, labels = "AUTO")

# Insights:
# 1. From the scatter plot, we can observe that there is a positive correlation between experience and salary. 
#    As experience increases, salary tends to increase as well.
# 2. The histogram of salaries shows that most of the employees earn salaries between $45,000 and $80,000.
# 3. The box plot illustrates the salary distribution across departments and gender. 
#    It shows the median, quartiles, and outliers within each group.