data <-read.csv("C:\\Users\\Krishna\\Downloads\\productivity+prediction+of+garment+employees\\garments_worker_productivity.csv")
chooseCRANmirror(graphics=FALSE, ind=1)  # Choose a CRAN mirror interactively
install.packages("pwr")
## Installing package into 'C:/Users/Krishna/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'pwr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Krishna\AppData\Local\Temp\RtmpAHwjX7\downloaded_packages
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
str(data)
## 'data.frame':    1197 obs. of  15 variables:
##  $ date                 : chr  "01-01-2015" "01-01-2015" "01-01-2015" "01-01-2015" ...
##  $ quarter              : chr  "Quarter1" "Quarter1" "Quarter1" "Quarter1" ...
##  $ department           : chr  "sweing" "finishing " "sweing" "sweing" ...
##  $ day                  : chr  "Thursday" "Thursday" "Thursday" "Thursday" ...
##  $ team                 : int  8 1 11 12 6 7 2 3 2 1 ...
##  $ targeted_productivity: num  0.8 0.75 0.8 0.8 0.8 0.8 0.75 0.75 0.75 0.75 ...
##  $ smv                  : num  26.16 3.94 11.41 11.41 25.9 ...
##  $ wip                  : int  1108 NA 968 968 1170 984 NA 795 733 681 ...
##  $ over_time            : int  7080 960 3660 3660 1920 6720 960 6900 6000 6900 ...
##  $ incentive            : int  98 0 50 50 50 38 0 45 34 45 ...
##  $ idle_time            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ idle_men             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_of_style_change   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ no_of_workers        : num  59 8 30.5 30.5 56 56 8 57.5 55 57.5 ...
##  $ actual_productivity  : num  0.941 0.886 0.801 0.801 0.8 ...
# Define alpha level 
alpha <- 0.05

# Define power level
power <- 0.80

# Define minimum effect size
effect_size <- 0.50  

# Function to calculate minimum sample size
calculate_sample_size <- function(alpha, power, effect_size) {
  z_alpha <- qnorm(1 - alpha/2)
  z_beta <- qnorm(power)
  n <- ((z_alpha + z_beta) / effect_size)^2
  return(ceiling(n))
}

# Null Hypothesis 1: Difference in productivity between sewing and finishing departments
# Calculating sample size for each department
sewing_sample_size <- nrow(data[data$department == "sweing", ])
finishing_sample_size <- nrow(data[data$department == "finishing", ])

# Calculate minimum sample size required
min_sample_size_1 <- calculate_sample_size(alpha, power, effect_size)

# Null Hypothesis 2: Difference in productivity between weekdays and weekends
# Calculate sample size for weekdays and weekends
weekday_sample_size <- nrow(data[data$day %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"), ])
weekend_sample_size <- nrow(data[data$day %in% c("Saturday", "Sunday"), ])

# Calculate minimum sample size required
min_sample_size_2 <- calculate_sample_size(alpha, power, effect_size)

# Display minimum sample sizes
cat("Minimum sample size required for Hypothesis 1:", min_sample_size_1, "\n")
## Minimum sample size required for Hypothesis 1: 32
cat("Minimum sample size required for Hypothesis 2:", min_sample_size_2, "\n")
## Minimum sample size required for Hypothesis 2: 32
# Check if sample size is sufficient for Hypothesis 1
if (sewing_sample_size >= min_sample_size_1 & finishing_sample_size >= min_sample_size_1) {
  # Perform Neyman-Pearson hypothesis test if sample size is sufficient
  # For illustration purposes, let's assume we're using a t-test
  test_result_np_1 <- t.test(data[data$department == "sweing", ]$actual_productivity,
                        data[data$department == "finishing", ]$actual_productivity)
  print(test_result_np_1)
  
  # Perform two-sample t-test for Neyman-Pearson hypothesis test between the two groups
  t_test_result_1 <- t.test(data[data$department == "sweing", ]$actual_productivity,
                        data[data$department == "finishing", ]$actual_productivity)
  print(t_test_result_1)
  
  # Create a contingency table for Fisher's test
  contingency_table_1 <- table(data$department, data$actual_productivity > 0.8)
  
  # Perform Fisher's style test for significance on Null Hypothesis 1
  fisher_test_result_1 <- fisher.test(contingency_table_1)
  print(fisher_test_result_1)
} else {
  print("Insufficient data for hypothesis test 1")
}
## 
##  Welch Two Sample t-test
## 
## data:  data[data$department == "sweing", ]$actual_productivity and data[data$department == "finishing", ]$actual_productivity
## t = -0.060874, df = 356.72, p-value = 0.9515
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02873231  0.02700700
## sample estimates:
## mean of x mean of y 
## 0.7220130 0.7228757 
## 
## 
##  Welch Two Sample t-test
## 
## data:  data[data$department == "sweing", ]$actual_productivity and data[data$department == "finishing", ]$actual_productivity
## t = -0.060874, df = 356.72, p-value = 0.9515
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02873231  0.02700700
## sample estimates:
## mean of x mean of y 
## 0.7220130 0.7228757 
## 
## 
##  Fisher's Exact Test for Count Data
## 
## data:  contingency_table_1
## p-value = 3.685e-05
## alternative hypothesis: two.sided
# Check if sample size is sufficient for Hypothesis 2
if (weekday_sample_size >= min_sample_size_2 & weekend_sample_size >= min_sample_size_2) {
  # Perform Neyman-Pearson hypothesis test if sample size is sufficient
  # For illustration purposes, let's assume we're using a t-test
  test_result_np_2 <- t.test(data[data$day %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"), ]$actual_productivity,
                        data[data$day %in% c("Saturday", "Sunday"), ]$actual_productivity)
  print(test_result_np_2)
  
  # Perform two-sample t-test for Neyman-Pearson hypothesis test between the two groups
  t_test_result_2 <- t.test(data[data$day %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"), ]$actual_productivity,
                        data[data$day %in% c("Saturday", "Sunday"), ]$actual_productivity)
  print(t_test_result_2)
  
  # Create a contingency table for Fisher's test
  contingency_table_2 <- table(data$day %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"), 
                               data$actual_productivity > 0.8)
  
  # Perform Fisher's style test for significance on Null Hypothesis 2
  fisher_test_result_2 <- fisher.test(contingency_table_2)
  print(fisher_test_result_2)
} else {
  # Explain why there isn't enough data
  print("Insufficient data for hypothesis test 2")
}
## 
##  Welch Two Sample t-test
## 
## data:  data[data$day %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"), ]$actual_productivity and data[data$day %in% c("Saturday", "Sunday"), ]$actual_productivity
## t = -0.64259, df = 754.95, p-value = 0.5207
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02825042  0.01431674
## sample estimates:
## mean of x mean of y 
## 0.7328212 0.7397880 
## 
## 
##  Welch Two Sample t-test
## 
## data:  data[data$day %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"), ]$actual_productivity and data[data$day %in% c("Saturday", "Sunday"), ]$actual_productivity
## t = -0.64259, df = 754.95, p-value = 0.5207
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.02825042  0.01431674
## sample estimates:
## mean of x mean of y 
## 0.7328212 0.7397880 
## 
## 
##  Fisher's Exact Test for Count Data
## 
## data:  contingency_table_2
## p-value = 0.5781
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.7234027 1.1916835
## sample estimates:
## odds ratio 
##  0.9284004
# Build visualizations
# Visualization 1: Boxplot showing productivity distribution by department
ggplot(data, aes(x = department, y = actual_productivity)) +
  geom_boxplot() +
  labs(title = "Productivity Distribution by Department",
       x = "Department",
       y = "Actual Productivity")

# Visualization 2: Histogram showing productivity distribution on weekdays and weekends
ggplot(data, aes(x = actual_productivity, fill = day %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"))) +
  geom_histogram(binwidth = 0.05, alpha = 0.7) +
  scale_fill_manual(values = c("Weekday" = "blue", "Weekend" = "red")) +
  labs(title = "Productivity Distribution on Weekdays and Weekends",
       x = "Actual Productivity",
       y = "Frequency",
       fill = "Day") +
  theme_minimal()

Explanation for choosing alpha, power and minimum effect size

1)Alpha=

The alpha level, set at 0.05, determines the threshold for significance in hypothesis testing. In this manufacturing dataset, an alpha level of 0.05 means that there’s a 5% chance of incorrectly concluding that there’s a difference in productivity between groups (such as sewing and finishing departments) when there isn’t one

2)Power=

The power level, set at 0.80, indicates the probability of correctly detecting a true difference in productivity between groups if one exists. In other words, it represents the ability of the study to identify meaningful differences.

With a power level of 0.80, there’s an 80% chance of correctly identifying productivity differences between groups like weekdays and weekends or sewing and finishing departments. This ensures that the study has a high likelihood of detecting relevant differences, enhancing the reliability of its findings.

3)Minimum effect size=

The minimum effect size, chosen as 0.50, represents the smallest difference in productivity that the study aims to detect. Specifically, an effect size of 0.50 indicates a moderate difference between groups.

Insights

Null hypothesis 1=

The hypothesis test examines whether there is a significant difference in productivity between the sewing and finishing departments. The test results indicate whether there is evidence to reject the null hypothesis, suggesting that there is indeed a difference in productivity between the two departments.

Null hypothesis 2=

This hypothesis test examines whether there is a significant difference in productivity between weekdays and weekends. The results provide insights into whether productivity levels vary significantly based on the day of the week.