Load CSV file

# Import the garment productivity CSV into a data frame.
garment_prod <- read.csv(
  file = "/Users/lakshmimounikab/Desktop/Stats with R/R practice/garment_prod.csv"
)
# Store the team column as character rather than the type read.csv() inferred.
garment_prod[["team"]] <- as.character(garment_prod[["team"]])
# Open the data in the viewer, then print per-column summary statistics.
View(garment_prod)
summary(garment_prod)
##      date             quarter           department            day           
##  Length:1197        Length:1197        Length:1197        Length:1197       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##      team           targeted_productivity      smv             wip         
##  Length:1197        Min.   :0.0700        Min.   : 2.90   Min.   :    7.0  
##  Class :character   1st Qu.:0.7000        1st Qu.: 3.94   1st Qu.:  774.5  
##  Mode  :character   Median :0.7500        Median :15.26   Median : 1039.0  
##                     Mean   :0.7296        Mean   :15.06   Mean   : 1190.5  
##                     3rd Qu.:0.8000        3rd Qu.:24.26   3rd Qu.: 1252.5  
##                     Max.   :0.8000        Max.   :54.56   Max.   :23122.0  
##                                                           NA's   :506      
##    over_time       incentive         idle_time           idle_men      
##  Min.   :    0   Min.   :   0.00   Min.   :  0.0000   Min.   : 0.0000  
##  1st Qu.: 1440   1st Qu.:   0.00   1st Qu.:  0.0000   1st Qu.: 0.0000  
##  Median : 3960   Median :   0.00   Median :  0.0000   Median : 0.0000  
##  Mean   : 4567   Mean   :  38.21   Mean   :  0.7302   Mean   : 0.3693  
##  3rd Qu.: 6960   3rd Qu.:  50.00   3rd Qu.:  0.0000   3rd Qu.: 0.0000  
##  Max.   :25920   Max.   :3600.00   Max.   :300.0000   Max.   :45.0000  
##                                                                        
##  no_of_style_change no_of_workers   actual_productivity
##  Min.   :0.0000     Min.   : 2.00   Min.   :0.2337     
##  1st Qu.:0.0000     1st Qu.: 9.00   1st Qu.:0.6503     
##  Median :0.0000     Median :34.00   Median :0.7733     
##  Mean   :0.1504     Mean   :34.61   Mean   :0.7351     
##  3rd Qu.:0.0000     3rd Qu.:57.00   3rd Qu.:0.8503     
##  Max.   :2.0000     Max.   :89.00   Max.   :1.1204     
## 

Load required libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
library(ggplot2)

Static random sampling with aggregate values

# Draw a single sample holding 50% as many rows as garment_prod, with replacement.
sample_1 <- garment_prod |> sample_frac(0.5, replace = TRUE)
View(sample_1)
# Mean of targeted_productivity within the sample.
sample_1 |> pluck("targeted_productivity") |> mean()
## [1] 0.7371237
# Maximum of wip within the sample. The wip column contains missing values
# (506 NAs in the full data), so max() returns NA unless they are dropped.
sample_1 |> pluck("wip") |> max(na.rm = TRUE)
## (stale output: this call printed `[1] NA` before na.rm = TRUE was added; re-run to refresh)
# Mean of actual_productivity within the sample.
sample_1 |> pluck("actual_productivity") |> mean()
## [1] 0.7341585

Random sampling of garment_prod dataset

I’ve randomly generated between 5 and 10 subsamples, each containing 50% as many rows as the full dataset (drawn with replacement) and restricted to 7 columns: quarter, day, targeted_productivity, actual_productivity, smv, over_time, and no_of_workers.

# Build a random number (between 5 and 10) of subsamples of the selected columns.
# Each subsample has half as many rows as garment_prod, drawn with replacement.
num <- sample(5:10, 1)
columns <- c(
  "quarter", "day", "targeted_productivity", "actual_productivity",
  "smv", "over_time", "no_of_workers"
)
# The row count and subsample size are the same on every iteration, so they
# are computed once, outside the loop.
n_rows <- nrow(garment_prod)
s_size <- round(0.5 * n_rows)
# Preallocate the result list instead of growing it inside the loop.
subsample_list <- vector("list", num)
for (i in seq_len(num)) {
  # seq_len() yields an empty sequence for zero rows, whereas 1:0 is c(1, 0).
  s_index <- sample(seq_len(n_rows), size = s_size, replace = TRUE)
  subsample <- garment_prod[s_index, columns]
  subsample_list[[i]] <- subsample
}

Summaries of the sample data

The summary statistics for each subsample created above are printed using a for loop.

# Convert the summary() of every subsample into a knitr table.
summary_table <- lapply(
  subsample_list,
  \(df) knitr::kable(summary(df), caption = "Summary Statistics")
)
# Print a heading line followed by the table for each subsample, in order.
for (k in seq_len(num)) {
  cat("### Subsample", k, "summary statisics \n")
  print(summary_table[[k]])
}
## ### Subsample 1 summary statisics 
## 
## 
## Table: Summary Statistics
## 
## |   |  quarter        |    day          |targeted_productivity |actual_productivity |     smv      |  over_time   |no_of_workers |
## |:--|:----------------|:----------------|:---------------------|:-------------------|:-------------|:-------------|:-------------|
## |   |Length:598       |Length:598       |Min.   :0.0700        |Min.   :0.2380      |Min.   : 2.90 |Min.   :    0 |Min.   : 2.00 |
## |   |Class :character |Class :character |1st Qu.:0.7000        |1st Qu.:0.6503      |1st Qu.: 3.94 |1st Qu.: 1440 |1st Qu.: 9.00 |
## |   |Mode  :character |Mode  :character |Median :0.7500        |Median :0.7688      |Median :14.89 |Median : 3780 |Median :34.00 |
## |   |NA               |NA               |Mean   :0.7285        |Mean   :0.7323      |Mean   :14.60 |Mean   : 4585 |Mean   :33.87 |
## |   |NA               |NA               |3rd Qu.:0.8000        |3rd Qu.:0.8501      |3rd Qu.:22.94 |3rd Qu.: 6840 |3rd Qu.:57.00 |
## |   |NA               |NA               |Max.   :0.8000        |Max.   :1.1204      |Max.   :50.48 |Max.   :15120 |Max.   :89.00 |
## ### Subsample 2 summary statisics 
## 
## 
## Table: Summary Statistics
## 
## |   |  quarter        |    day          |targeted_productivity |actual_productivity |     smv      |  over_time   |no_of_workers |
## |:--|:----------------|:----------------|:---------------------|:-------------------|:-------------|:-------------|:-------------|
## |   |Length:598       |Length:598       |Min.   :0.0700        |Min.   :0.2337      |Min.   : 2.90 |Min.   :    0 |Min.   : 2.00 |
## |   |Class :character |Class :character |1st Qu.:0.7000        |1st Qu.:0.6360      |1st Qu.: 3.94 |1st Qu.: 1440 |1st Qu.: 8.00 |
## |   |Mode  :character |Mode  :character |Median :0.7500        |Median :0.7544      |Median :11.61 |Median : 3780 |Median :33.00 |
## |   |NA               |NA               |Mean   :0.7318        |Mean   :0.7272      |Mean   :14.59 |Mean   : 4417 |Mean   :33.32 |
## |   |NA               |NA               |3rd Qu.:0.8000        |3rd Qu.:0.8501      |3rd Qu.:24.26 |3rd Qu.: 6952 |3rd Qu.:57.00 |
## |   |NA               |NA               |Max.   :0.8000        |Max.   :1.1204      |Max.   :51.02 |Max.   :14640 |Max.   :60.00 |
## ### Subsample 3 summary statisics 
## 
## 
## Table: Summary Statistics
## 
## |   |  quarter        |    day          |targeted_productivity |actual_productivity |     smv      |  over_time   |no_of_workers |
## |:--|:----------------|:----------------|:---------------------|:-------------------|:-------------|:-------------|:-------------|
## |   |Length:598       |Length:598       |Min.   :0.3500        |Min.   :0.2380      |Min.   : 2.90 |Min.   :    0 |Min.   : 5.00 |
## |   |Class :character |Class :character |1st Qu.:0.7000        |1st Qu.:0.6504      |1st Qu.: 3.94 |1st Qu.: 1440 |1st Qu.: 9.00 |
## |   |Mode  :character |Mode  :character |Median :0.7500        |Median :0.7733      |Median :15.26 |Median : 4080 |Median :34.00 |
## |   |NA               |NA               |Mean   :0.7334        |Mean   :0.7364      |Mean   :14.83 |Mean   : 4548 |Mean   :34.04 |
## |   |NA               |NA               |3rd Qu.:0.8000        |3rd Qu.:0.8503      |3rd Qu.:23.41 |3rd Qu.: 6900 |3rd Qu.:57.00 |
## |   |NA               |NA               |Max.   :0.8000        |Max.   :1.0580      |Max.   :51.02 |Max.   :10770 |Max.   :89.00 |
## ### Subsample 4 summary statisics 
## 
## 
## Table: Summary Statistics
## 
## |   |  quarter        |    day          |targeted_productivity |actual_productivity |     smv      |  over_time   |no_of_workers |
## |:--|:----------------|:----------------|:---------------------|:-------------------|:-------------|:-------------|:-------------|
## |   |Length:598       |Length:598       |Min.   :0.0700        |Min.   :0.2337      |Min.   : 2.90 |Min.   :    0 |Min.   : 2.00 |
## |   |Class :character |Class :character |1st Qu.:0.7000        |1st Qu.:0.6500      |1st Qu.: 4.08 |1st Qu.: 1440 |1st Qu.: 9.00 |
## |   |Mode  :character |Mode  :character |Median :0.7500        |Median :0.7508      |Median :15.26 |Median : 4080 |Median :35.00 |
## |   |NA               |NA               |Mean   :0.7331        |Mean   :0.7238      |Mean   :15.39 |Mean   : 4662 |Mean   :35.29 |
## |   |NA               |NA               |3rd Qu.:0.8000        |3rd Qu.:0.8501      |3rd Qu.:23.41 |3rd Qu.: 6900 |3rd Qu.:57.00 |
## |   |NA               |NA               |Max.   :0.8000        |Max.   :1.1204      |Max.   :51.02 |Max.   :25920 |Max.   :60.00 |
## ### Subsample 5 summary statisics 
## 
## 
## Table: Summary Statistics
## 
## |   |  quarter        |    day          |targeted_productivity |actual_productivity |     smv      |  over_time   |no_of_workers |
## |:--|:----------------|:----------------|:---------------------|:-------------------|:-------------|:-------------|:-------------|
## |   |Length:598       |Length:598       |Min.   :0.070         |Min.   :0.2462      |Min.   : 2.90 |Min.   :    0 |Min.   : 2.00 |
## |   |Class :character |Class :character |1st Qu.:0.700         |1st Qu.:0.6468      |1st Qu.: 4.08 |1st Qu.: 1440 |1st Qu.:10.00 |
## |   |Mode  :character |Mode  :character |Median :0.750         |Median :0.7507      |Median :15.26 |Median : 4080 |Median :35.00 |
## |   |NA               |NA               |Mean   :0.725         |Mean   :0.7243      |Mean   :15.73 |Mean   : 4512 |Mean   :35.56 |
## |   |NA               |NA               |3rd Qu.:0.800         |3rd Qu.:0.8174      |3rd Qu.:25.75 |3rd Qu.: 6840 |3rd Qu.:57.00 |
## |   |NA               |NA               |Max.   :0.800         |Max.   :1.0580      |Max.   :54.56 |Max.   :15120 |Max.   :60.00 |

Scrutinizing subsamples

To understand the variation within the dataset, I’ve chosen one column, “no_of_workers”, and examined its distribution in each subsample using histograms.

# Summary statistics for each subsample.
summary_stats <- lapply(subsample_list, summary)
# One histogram of no_of_workers per subsample, using a bin width of 1.
histograms <- lapply(subsample_list, function(subsample) {
  ggplot(subsample, aes(x = no_of_workers)) +
    geom_histogram(binwidth = 1, fill = "blue", color = "black") +
    # The labels name the plotted column; the title previously said "SMV"
    # although the x aesthetic is no_of_workers.
    labs(
      title = "Histogram of no_of_workers",
      x = "Number of workers",
      y = "Frequency"
    )
})
# Print each subsample's summary followed by its histogram.
for (i in seq_along(subsample_list)) {
  cat("Subsample", i, "summary statistics:\n")
  print(summary_stats[[i]])
  print(histograms[[i]])
}
## Subsample 1 summary statistics:
##    quarter              day            targeted_productivity
##  Length:598         Length:598         Min.   :0.0700       
##  Class :character   Class :character   1st Qu.:0.7000       
##  Mode  :character   Mode  :character   Median :0.7500       
##                                        Mean   :0.7285       
##                                        3rd Qu.:0.8000       
##                                        Max.   :0.8000       
##  actual_productivity      smv          over_time     no_of_workers  
##  Min.   :0.2380      Min.   : 2.90   Min.   :    0   Min.   : 2.00  
##  1st Qu.:0.6503      1st Qu.: 3.94   1st Qu.: 1440   1st Qu.: 9.00  
##  Median :0.7688      Median :14.89   Median : 3780   Median :34.00  
##  Mean   :0.7323      Mean   :14.60   Mean   : 4585   Mean   :33.87  
##  3rd Qu.:0.8501      3rd Qu.:22.94   3rd Qu.: 6840   3rd Qu.:57.00  
##  Max.   :1.1204      Max.   :50.48   Max.   :15120   Max.   :89.00

## Subsample 2 summary statistics:
##    quarter              day            targeted_productivity
##  Length:598         Length:598         Min.   :0.0700       
##  Class :character   Class :character   1st Qu.:0.7000       
##  Mode  :character   Mode  :character   Median :0.7500       
##                                        Mean   :0.7318       
##                                        3rd Qu.:0.8000       
##                                        Max.   :0.8000       
##  actual_productivity      smv          over_time     no_of_workers  
##  Min.   :0.2337      Min.   : 2.90   Min.   :    0   Min.   : 2.00  
##  1st Qu.:0.6360      1st Qu.: 3.94   1st Qu.: 1440   1st Qu.: 8.00  
##  Median :0.7544      Median :11.61   Median : 3780   Median :33.00  
##  Mean   :0.7272      Mean   :14.59   Mean   : 4417   Mean   :33.32  
##  3rd Qu.:0.8501      3rd Qu.:24.26   3rd Qu.: 6952   3rd Qu.:57.00  
##  Max.   :1.1204      Max.   :51.02   Max.   :14640   Max.   :60.00

## Subsample 3 summary statistics:
##    quarter              day            targeted_productivity
##  Length:598         Length:598         Min.   :0.3500       
##  Class :character   Class :character   1st Qu.:0.7000       
##  Mode  :character   Mode  :character   Median :0.7500       
##                                        Mean   :0.7334       
##                                        3rd Qu.:0.8000       
##                                        Max.   :0.8000       
##  actual_productivity      smv          over_time     no_of_workers  
##  Min.   :0.2380      Min.   : 2.90   Min.   :    0   Min.   : 5.00  
##  1st Qu.:0.6504      1st Qu.: 3.94   1st Qu.: 1440   1st Qu.: 9.00  
##  Median :0.7733      Median :15.26   Median : 4080   Median :34.00  
##  Mean   :0.7364      Mean   :14.83   Mean   : 4548   Mean   :34.04  
##  3rd Qu.:0.8503      3rd Qu.:23.41   3rd Qu.: 6900   3rd Qu.:57.00  
##  Max.   :1.0580      Max.   :51.02   Max.   :10770   Max.   :89.00

## Subsample 4 summary statistics:
##    quarter              day            targeted_productivity
##  Length:598         Length:598         Min.   :0.0700       
##  Class :character   Class :character   1st Qu.:0.7000       
##  Mode  :character   Mode  :character   Median :0.7500       
##                                        Mean   :0.7331       
##                                        3rd Qu.:0.8000       
##                                        Max.   :0.8000       
##  actual_productivity      smv          over_time     no_of_workers  
##  Min.   :0.2337      Min.   : 2.90   Min.   :    0   Min.   : 2.00  
##  1st Qu.:0.6500      1st Qu.: 4.08   1st Qu.: 1440   1st Qu.: 9.00  
##  Median :0.7508      Median :15.26   Median : 4080   Median :35.00  
##  Mean   :0.7238      Mean   :15.39   Mean   : 4662   Mean   :35.29  
##  3rd Qu.:0.8501      3rd Qu.:23.41   3rd Qu.: 6900   3rd Qu.:57.00  
##  Max.   :1.1204      Max.   :51.02   Max.   :25920   Max.   :60.00

## Subsample 5 summary statistics:
##    quarter              day            targeted_productivity
##  Length:598         Length:598         Min.   :0.070        
##  Class :character   Class :character   1st Qu.:0.700        
##  Mode  :character   Mode  :character   Median :0.750        
##                                        Mean   :0.725        
##                                        3rd Qu.:0.800        
##                                        Max.   :0.800        
##  actual_productivity      smv          over_time     no_of_workers  
##  Min.   :0.2462      Min.   : 2.90   Min.   :    0   Min.   : 2.00  
##  1st Qu.:0.6468      1st Qu.: 4.08   1st Qu.: 1440   1st Qu.:10.00  
##  Median :0.7507      Median :15.26   Median : 4080   Median :35.00  
##  Mean   :0.7243      Mean   :15.73   Mean   : 4512   Mean   :35.56  
##  3rd Qu.:0.8174      3rd Qu.:25.75   3rd Qu.: 6840   3rd Qu.:57.00  
##  Max.   :1.0580      Max.   :54.56   Max.   :15120   Max.   :60.00

On observing the histograms, I’ve noticed that:

- The frequencies in each subsample range from about 50 up to 150 or 200.
- The overall variation does not differ much between the subsamples.
- The peaks may appear to be in the same places, but they differ in frequency.

Anomalies and consistency

To understand anomalies and consistency, I’ve chosen to compute the mean and standard deviation of the “no_of_workers” column for each subsample.

# Per-subsample mean and standard deviation of no_of_workers, kept as lists.
worker_counts <- lapply(subsample_list, \(df) df[["no_of_workers"]])
means <- lapply(worker_counts, mean)
sds <- lapply(worker_counts, sd)
# Report both statistics for every subsample.
for (k in seq_len(num)) {
  cat("Subsample", k, "Mean of Number of workers:", means[[k]], "\n")
  cat("Subsample", k, "SD of Number of workers:", sds[[k]], "\n")
}
## Subsample 1 Mean of Number of workers: 33.8704 
## Subsample 1 SD of Number of workers: 22.33025 
## Subsample 2 Mean of Number of workers: 33.31856 
## Subsample 2 SD of Number of workers: 22.3672 
## Subsample 3 Mean of Number of workers: 34.03512 
## Subsample 3 SD of Number of workers: 22.11513 
## Subsample 4 Mean of Number of workers: 35.28763 
## Subsample 4 SD of Number of workers: 22.1017 
## Subsample 5 Mean of Number of workers: 35.5602 
## Subsample 5 SD of Number of workers: 21.84956

On observing the data, it is clear that the mean of each subsample shows no prominent variation; it stays within the range of 30 to 40. Similarly, the standard deviation does not change significantly; its values range from 21 to 23. This narrow range and minimal deviation suggest that the data in the no_of_workers column is consistent across the subsamples.