# Load the garment-worker productivity data set from its local CSV export.
data <- read.csv("C:\\Users\\Krishna\\Downloads\\productivity+prediction+of+garment+employees\\garments_worker_productivity.csv")

# Some department labels in the raw file carry stray trailing whitespace
# ("finishing " next to "finishing"). Trim them once here so that every
# group_by() / merge() on this column treats them as one department.
data$department <- trimws(data$department)

# Pin the CRAN mirror so install.packages() does not need to prompt for one.
options(repos = c(CRAN = "https://cran.rstudio.com/"))

GROUP BY QUARTER

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Mean actual productivity for each quarter label.
groupset_1 <- summarise(
  group_by(data, quarter),
  avg_productivity = mean(actual_productivity)
)

# Keep the single row with the smallest average (first one on ties) and show it.
lowest_count_group_df1 <- groupset_1[which.min(groupset_1[["avg_productivity"]]), ]
print(lowest_count_group_df1)

## # A tibble: 1 × 2
##   quarter  avg_productivity
##   <chr>               <dbl>
## 1 Quarter3            0.705

# Label the lowest-average quarter so its rows can be flagged in the full data.
lowest_count_group_df1$tag <- "Lowest_Probability_Group_DF1"

# Left-merge the label onto every row of `data`; all.x = TRUE keeps the rows of
# the other quarters, with NA in the label column.
# The label is merged under a key-specific column name so it cannot collide
# with the label columns added by the other merges into `data_merged`
# (merge() renames colliding columns to the ambiguous tag.x / tag.y).
quarter_tag <- setNames(
  lowest_count_group_df1[, c("quarter", "tag")],
  c("quarter", "tag_quarter")
)
data_merged <- merge(data, quarter_tag, by = "quarter", all.x = TRUE)

# Install ggplot2 only when it is not already available, so re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

## Installing package into 'C:/Users/Krishna/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)

## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Krishna\AppData\Local\Temp\Rtmpw17dCg\downloaded_packages

library(ggplot2)

# Bar chart of average productivity per quarter; geom_col() uses the y values
# directly as bar heights.
ggplot(data = groupset_1, mapping = aes(x = quarter, y = avg_productivity)) +
  geom_col()

INSIGHT = Grouping the data by quarter lets us gain insight into overall trends and performance.

GROUP BY DEPARTMENT

# Total recorded work in progress (wip) per department.
# Rows with a missing wip are dropped before summing: sum() returns NA as soon
# as one value in the group is NA, and which.min() below silently skips NA
# totals, so an affected department was never really compared (and the bar
# chart dropped it with a "removed rows" warning).
# NOTE(review): a department with no recorded wip at all will not appear in
# this summary — confirm that is the intended treatment.
groupset_2 <- data %>%
  filter(!is.na(wip)) %>%
  group_by(department) %>%
  summarise(total_wip = sum(wip))

# Row with the smallest total (first one on ties).
lowest_count_group_df2 <- groupset_2[which.min(groupset_2$total_wip), ]
print(lowest_count_group_df2)

## # A tibble: 1 × 2
##   department total_wip
##   <chr>          <int>
## 1 sweing        822612

# Label the department with the lowest total wip.
lowest_count_group_df2$tag <- "Lowest_Probability_Group_DF2"

# Left-merge the label onto `data_merged`; all.x = TRUE keeps the rows of the
# other departments, with NA in the label column.
# The label is merged under a key-specific column name so it cannot collide
# with the label columns added by the other merges into `data_merged`
# (merge() renames colliding columns to the ambiguous tag.x / tag.y).
department_tag <- setNames(
  lowest_count_group_df2[, c("department", "tag")],
  c("department", "tag_department")
)
data_merged <- merge(data_merged, department_tag, by = "department", all.x = TRUE)

# Bar chart of total wip per department; geom_col() uses the y values directly
# as bar heights.
ggplot(data = groupset_2, mapping = aes(x = department, y = total_wip)) +
  geom_col()

## Warning: Removed 2 rows containing missing values (`position_stack()`).

INSIGHT = Grouping by department lets us analyze the productivity levels of each department and identify which departments perform well and which need improvement.

GROUP BY DAY

# Total incentive paid out on each day of the week.
groupset_3 <- summarise(
  group_by(data, day),
  total_incentive = sum(incentive)
)

# Keep the single row with the smallest total (first one on ties) and show it.
lowest_count_group_df3 <- groupset_3[which.min(groupset_3[["total_incentive"]]), ]
print(lowest_count_group_df3)

## # A tibble: 1 × 2
##   day    total_incentive
##   <chr>            <int>
## 1 Sunday            4906

# Label the day with the lowest total incentive.
lowest_count_group_df3$tag <- "Lowest_Probability_Group_DF3"

# Left-merge the label onto `data_merged`; all.x = TRUE keeps the rows of the
# other days, with NA in the label column.
# The label is merged under a key-specific column name so it cannot collide
# with the label columns added by the other merges into `data_merged`
# (merge() renames colliding columns to the ambiguous tag.x / tag.y).
day_tag <- setNames(
  lowest_count_group_df3[, c("day", "tag")],
  c("day", "tag_day")
)
data_merged <- merge(data_merged, day_tag, by = "day", all.x = TRUE)

# Bar chart of total incentive per day; geom_col() uses the y values directly
# as bar heights.
ggplot(data = groupset_3, mapping = aes(x = day, y = total_incentive)) +
  geom_col()

INSIGHTS = Grouping the data by day helps to find the peak and off-peak days of the week, which helps the organization deploy its workforce according to those trends.

Hypothesis and conclusion

1)for group by quarter=

The average monthly sales revenue in a given business varies across different quarters of the year. Specifically, I hypothesize that the average monthly sales revenue is significantly higher in the fourth quarter (October, November, December) compared to the other three quarters.

Conclusion = The data reveals noticeable seasonal variation in sales revenue across the four quarters, which helps the company make use of its resources in a timely manner.

2)For group by department=

The hypothesis aims to explore whether there are statistically significant differences in sales performance across various departments within the organization.

Conclusion=

This is crucial for organisational decision-making about how resources are used by the respective departments.

3)For group by day=

The hypothesis aims to investigate whether there is a significant difference in the mean values of a specific variable, such as daily sales across different days of the week.

Conclusion=

Businesses can use the insights gained from grouping by day to create more accurate sales forecasts, marketing calendars, and operational plans, aligning resources with anticipated demand.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Department/day pairs that actually occur in the data (one row per pair).
existing_combinations <- distinct(data, department, day)

# Every possible pairing of the observed departments with the observed days.
all_combinations <- crossing(
  department = unique(data[["department"]]),
  day = unique(data[["day"]])
)

# Pairings that are possible but never observed; an empty result means every
# department has at least one row for every day.
missing_combinations <- setdiff(x = all_combinations, y = existing_combinations)
print(missing_combinations)

## # A tibble: 0 × 2
## # ℹ 2 variables: department <chr>, day <chr>

library(dplyr)

# Number of rows for each department/day pair, most frequent first.
# count() returns an ungrouped result, so there is no leftover grouping by
# department and no "`summarise()` has grouped output" message.
data %>%
  count(department, day, name = "count", sort = TRUE)

## `summarise()` has grouped output by 'department'. You can override using the
## `.groups` argument.

## # A tibble: 18 × 3
## # Groups:   department [3]
##    department   day       count
##    <chr>        <chr>     <int>
##  1 "sweing"     Wednesday   119
##  2 "sweing"     Thursday    118
##  3 "sweing"     Tuesday     118
##  4 "sweing"     Monday      116
##  5 "sweing"     Sunday      116
##  6 "sweing"     Saturday    104
##  7 "finishing"  Wednesday    52
##  8 "finishing " Saturday     51
##  9 "finishing " Sunday       44
## 10 "finishing"  Sunday       43
## 11 "finishing"  Tuesday      43
## 12 "finishing " Monday       43
## 13 "finishing " Thursday     42
## 14 "finishing"  Monday       40
## 15 "finishing " Tuesday      40
## 16 "finishing"  Thursday     39
## 17 "finishing " Wednesday    37
## 18 "finishing"  Saturday     32

Insight = We are able to find which combinations of categorical variables occur most frequently and which occur least frequently, so that the organization can optimize its resources for production.

library(ggplot2)

# Bar chart of how often each department/day pair occurs, coloured by
# department. count() yields an ungrouped table, which avoids the
# "`summarise()` has grouped output" message that summarize(n()) emits.
data %>%
  count(department, day, name = "count") %>%
  ggplot(aes(x = interaction(department, day), y = count, fill = department)) +
  geom_col(position = "dodge") +
  labs(title = "Frequency of Department-Day Combinations", x = "Department-Day", y = "Count") +
  # The "department.day" axis labels are long; tilt them so that neighbouring
  # labels are less likely to overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## `summarise()` has grouped output by 'department'. You can override using the
## `.groups` argument.

Datadive week3

2024-01-29

GROUP BY QUARTER

GROUP BY DEPARTMENT

GROUP BY DAY

Hypothesis and conclusion