# Load the garment-worker productivity data set into a data frame.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where the CSV exists at exactly this location.
data <- read.csv("C:\\Users\\Krishna\\Downloads\\productivity+prediction+of+garment+employees\\garments_worker_productivity.csv")
# Select the first CRAN mirror non-interactively (no GUI prompt, index 1).
chooseCRANmirror(graphics = FALSE, ind = 1)
# Install pwr only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("pwr", quietly = TRUE)) {
  install.packages("pwr")
}
## Installing package into 'C:/Users/Krishna/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'pwr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Krishna\AppData\Local\Temp\RtmpAHwjX7\downloaded_packages
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Print a compact overview of the loaded data frame: one line per column
# with its type and first few values.
str(data)
## 'data.frame': 1197 obs. of 15 variables:
## $ date : chr "01-01-2015" "01-01-2015" "01-01-2015" "01-01-2015" ...
## $ quarter : chr "Quarter1" "Quarter1" "Quarter1" "Quarter1" ...
## $ department : chr "sweing" "finishing " "sweing" "sweing" ...
## $ day : chr "Thursday" "Thursday" "Thursday" "Thursday" ...
## $ team : int 8 1 11 12 6 7 2 3 2 1 ...
## $ targeted_productivity: num 0.8 0.75 0.8 0.8 0.8 0.8 0.75 0.75 0.75 0.75 ...
## $ smv : num 26.16 3.94 11.41 11.41 25.9 ...
## $ wip : int 1108 NA 968 968 1170 984 NA 795 733 681 ...
## $ over_time : int 7080 960 3660 3660 1920 6720 960 6900 6000 6900 ...
## $ incentive : int 98 0 50 50 50 38 0 45 34 45 ...
## $ idle_time : num 0 0 0 0 0 0 0 0 0 0 ...
## $ idle_men : int 0 0 0 0 0 0 0 0 0 0 ...
## $ no_of_style_change : int 0 0 0 0 0 0 0 0 0 0 ...
## $ no_of_workers : num 59 8 30.5 30.5 56 56 8 57.5 55 57.5 ...
## $ actual_productivity : num 0.941 0.886 0.801 0.801 0.8 ...
# Planning parameters used when sizing the hypothesis tests.
# Significance level (two-sided Type I error rate)
alpha <- 0.05
# Target power (probability of detecting an effect of the size below)
power <- 0.80
# Minimum effect size to detect. calculate_sample_size() divides the summed
# z-quantiles by this value, i.e. it is treated as a standardised difference.
effect_size <- 0.50

# Minimum sample size from the normal-approximation formula
#   n = ((z_{1 - alpha/2} + z_{power}) / effect_size)^2
# rounded up to the next whole observation.
#
# Arguments:
#   alpha       Two-sided significance level, strictly between 0 and 1.
#   power       Desired power, strictly between 0 and 1.
#   effect_size Standardised effect size; must be finite and non-zero
#               (only its magnitude matters because it is squared).
#   two_sample  If TRUE, return the per-group size for comparing two
#               independent groups of equal size (twice the one-sample
#               value). Defaults to FALSE, which reproduces the original
#               one-sample result.
#
# Returns: the required sample size as a whole number (numeric).
# Errors:  stops when an argument is outside the ranges given above, instead
#          of silently returning Inf or NaN.
calculate_sample_size <- function(alpha, power, effect_size,
                                  two_sample = FALSE) {
  stopifnot(
    is.numeric(alpha), alpha > 0, alpha < 1,
    is.numeric(power), power > 0, power < 1,
    is.numeric(effect_size), is.finite(effect_size), effect_size != 0,
    is.logical(two_sample), length(two_sample) == 1L, !is.na(two_sample)
  )
  z_alpha <- qnorm(1 - alpha / 2)
  z_beta <- qnorm(power)
  n <- ((z_alpha + z_beta) / effect_size)^2
  if (two_sample) {
    # Two independent groups: the variance of the mean difference doubles.
    n <- 2 * n
  }
  ceiling(n)
}
# Null Hypothesis 1: Difference in productivity between sewing and finishing departments
# Count the observations available in each department.
# The raw file stores some labels with stray whitespace (e.g. "finishing "),
# so labels are trimmed before comparing; otherwise those rows are silently
# left out of the finishing group. "sweing" is the spelling used in the data.
department_clean <- trimws(data$department)
sewing_sample_size <- sum(department_clean == "sweing", na.rm = TRUE)
finishing_sample_size <- sum(department_clean == "finishing", na.rm = TRUE)
# Calculate minimum sample size required
# NOTE(review): as called here, calculate_sample_size() uses the one-sample
# normal approximation; comparing two independent groups needs roughly twice
# as many observations per group.
min_sample_size_1 <- calculate_sample_size(alpha, power, effect_size)
# Null Hypothesis 2: Difference in productivity between weekdays and weekends
# Count the observations recorded on weekdays and on weekends.
is_weekday <- data$day %in%
  c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
is_weekend <- data$day %in% c("Saturday", "Sunday")
weekday_sample_size <- sum(is_weekday)
weekend_sample_size <- sum(is_weekend)
# Calculate minimum sample size required
min_sample_size_2 <- calculate_sample_size(alpha, power, effect_size)
# Display minimum sample sizes
cat("Minimum sample size required for Hypothesis 1:", min_sample_size_1, "\n")
## Minimum sample size required for Hypothesis 1: 32
cat("Minimum sample size required for Hypothesis 2:", min_sample_size_2, "\n")
## Minimum sample size required for Hypothesis 2: 32
# Check if sample size is sufficient for Hypothesis 1
# (`&&` is the scalar, short-circuiting operator meant for `if` conditions.)
if (sewing_sample_size >= min_sample_size_1 &&
      finishing_sample_size >= min_sample_size_1) {
  # The raw file stores some labels with stray whitespace (e.g. "finishing "),
  # so compare against trimmed labels; otherwise those rows are dropped from
  # the finishing group and form a separate row in the contingency table.
  department_clean <- trimws(data$department)
  sewing_productivity <-
    data$actual_productivity[which(department_clean == "sweing")]
  finishing_productivity <-
    data$actual_productivity[which(department_clean == "finishing")]
  # Neyman-Pearson style test, carried out as a Welch two-sample t-test.
  test_result_np_1 <- t.test(sewing_productivity, finishing_productivity)
  print(test_result_np_1)
  # The original ran and printed this identical t-test a second time; the
  # second result name is kept as an alias instead of recomputing it.
  t_test_result_1 <- test_result_np_1
  # Contingency table for Fisher's test: department vs. productivity > 0.8
  contingency_table_1 <- table(department_clean,
                               data$actual_productivity > 0.8)
  # Fisher's exact test on Null Hypothesis 1
  fisher_test_result_1 <- fisher.test(contingency_table_1)
  print(fisher_test_result_1)
} else {
  print("Insufficient data for hypothesis test 1")
}
# NOTE(review): the rendered output that follows was produced before the
# whitespace fix above; re-run the script to refresh it.
##
## Welch Two Sample t-test
##
## data: data[data$department == "sweing", ]$actual_productivity and data[data$department == "finishing", ]$actual_productivity
## t = -0.060874, df = 356.72, p-value = 0.9515
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02873231 0.02700700
## sample estimates:
## mean of x mean of y
## 0.7220130 0.7228757
##
##
## Welch Two Sample t-test
##
## data: data[data$department == "sweing", ]$actual_productivity and data[data$department == "finishing", ]$actual_productivity
## t = -0.060874, df = 356.72, p-value = 0.9515
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02873231 0.02700700
## sample estimates:
## mean of x mean of y
## 0.7220130 0.7228757
##
##
## Fisher's Exact Test for Count Data
##
## data: contingency_table_1
## p-value = 3.685e-05
## alternative hypothesis: two.sided
# Check if sample size is sufficient for Hypothesis 2
# (`&&` is the scalar, short-circuiting operator meant for `if` conditions.)
if (weekday_sample_size >= min_sample_size_2 &&
      weekend_sample_size >= min_sample_size_2) {
  # Build the two day-type masks once instead of repeating the day vectors.
  is_weekday <- data$day %in%
    c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
  is_weekend <- data$day %in% c("Saturday", "Sunday")
  weekday_productivity <- data$actual_productivity[is_weekday]
  weekend_productivity <- data$actual_productivity[is_weekend]
  # Neyman-Pearson style test, carried out as a Welch two-sample t-test.
  test_result_np_2 <- t.test(weekday_productivity, weekend_productivity)
  print(test_result_np_2)
  # The original ran and printed this identical t-test a second time; the
  # second result name is kept as an alias instead of recomputing it.
  t_test_result_2 <- test_result_np_2
  # Contingency table for Fisher's test: weekday flag vs. productivity > 0.8
  contingency_table_2 <- table(is_weekday, data$actual_productivity > 0.8)
  # Fisher's exact test on Null Hypothesis 2
  fisher_test_result_2 <- fisher.test(contingency_table_2)
  print(fisher_test_result_2)
} else {
  # Not enough observations in at least one of the two groups
  print("Insufficient data for hypothesis test 2")
}
##
## Welch Two Sample t-test
##
## data: data[data$day %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"), ]$actual_productivity and data[data$day %in% c("Saturday", "Sunday"), ]$actual_productivity
## t = -0.64259, df = 754.95, p-value = 0.5207
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02825042 0.01431674
## sample estimates:
## mean of x mean of y
## 0.7328212 0.7397880
##
##
## Welch Two Sample t-test
##
## data: data[data$day %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"), ]$actual_productivity and data[data$day %in% c("Saturday", "Sunday"), ]$actual_productivity
## t = -0.64259, df = 754.95, p-value = 0.5207
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.02825042 0.01431674
## sample estimates:
## mean of x mean of y
## 0.7328212 0.7397880
##
##
## Fisher's Exact Test for Count Data
##
## data: contingency_table_2
## p-value = 0.5781
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.7234027 1.1916835
## sample estimates:
## odds ratio
## 0.9284004
# Build visualizations
# Visualization 1: Boxplot showing productivity distribution by department
# Department labels are trimmed so that values stored with stray whitespace
# (e.g. "finishing ") fall into the same box as their clean counterpart.
ggplot(data, aes(x = trimws(department), y = actual_productivity)) +
  geom_boxplot() +
  labs(title = "Productivity Distribution by Department",
       x = "Department",
       y = "Actual Productivity")
# Visualization 2: Histogram showing productivity distribution on weekdays and weekends
# The fill aesthetic is mapped to the labels "Weekday"/"Weekend" rather than a
# bare logical, so that its levels match the names given to
# scale_fill_manual(); with TRUE/FALSE levels the named colours never match.
ggplot(
  data,
  aes(
    x = actual_productivity,
    fill = ifelse(
      day %in% c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday"),
      "Weekday",
      "Weekend"
    )
  )
) +
  geom_histogram(binwidth = 0.05, alpha = 0.7) +
  scale_fill_manual(values = c("Weekday" = "blue", "Weekend" = "red")) +
  labs(title = "Productivity Distribution on Weekdays and Weekends",
       x = "Actual Productivity",
       y = "Frequency",
       fill = "Day") +
  theme_minimal()
Explanation for choosing alpha, power and minimum effect size
1) Alpha
The alpha level, set at 0.05, is the significance threshold for the hypothesis tests. It is the probability of a Type I error: with alpha = 0.05, there is a 5% chance of concluding that productivity differs between groups (such as the sewing and finishing departments) when in truth there is no difference.
2)Power=
The power level, set at 0.80, indicates the probability of correctly detecting a true difference in productivity between groups if one exists. In other words, it represents the ability of the study to identify meaningful differences.
With a power level of 0.80, there’s an 80% chance of correctly identifying productivity differences between groups like weekdays and weekends or sewing and finishing departments. This ensures that the study has a high likelihood of detecting relevant differences, enhancing the reliability of its findings.
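3) Minimum effect size
The minimum effect size, chosen as 0.50, represents the smallest difference in productivity that the study aims to detect. It is a standardized effect size (the difference between group means divided by the standard deviation), so 0.50 corresponds to a difference of half a standard deviation, which is conventionally regarded as a moderate difference between groups.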
Null hypothesis 1
Null hypothesis 1 states that mean actual productivity is the same in the sewing and finishing departments. The hypothesis test examines whether the data contradict this: a p-value below alpha (0.05) is evidence to reject the null hypothesis and conclude that productivity differs between the two departments; otherwise the null hypothesis is not rejected.
Null hypothesis 2
Null hypothesis 2 states that mean actual productivity is the same on weekdays and on weekends. The hypothesis test examines whether the data contradict this: a p-value below alpha (0.05) is evidence that productivity levels vary with the type of day; otherwise the null hypothesis is not rejected.