# Load the garment productivity data from its CSV file.
garment_prod <- read.csv(
  "/Users/lakshmimounikab/Desktop/Stats with R/R practice/garment_prod.csv"
)
# Coerce the team column to character.
garment_prod[["team"]] <- as.character(garment_prod[["team"]])
# Open the data viewer, then print per-column summary statistics.
View(garment_prod)
summary(garment_prod)
## date quarter department day
## Length:1197 Length:1197 Length:1197 Length:1197
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## team targeted_productivity smv wip
## Length:1197 Min. :0.0700 Min. : 2.90 Min. : 7.0
## Class :character 1st Qu.:0.7000 1st Qu.: 3.94 1st Qu.: 774.5
## Mode :character Median :0.7500 Median :15.26 Median : 1039.0
## Mean :0.7296 Mean :15.06 Mean : 1190.5
## 3rd Qu.:0.8000 3rd Qu.:24.26 3rd Qu.: 1252.5
## Max. :0.8000 Max. :54.56 Max. :23122.0
## NA's :506
## over_time incentive idle_time idle_men
## Min. : 0 Min. : 0.00 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 1440 1st Qu.: 0.00 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 3960 Median : 0.00 Median : 0.0000 Median : 0.0000
## Mean : 4567 Mean : 38.21 Mean : 0.7302 Mean : 0.3693
## 3rd Qu.: 6960 3rd Qu.: 50.00 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :25920 Max. :3600.00 Max. :300.0000 Max. :45.0000
##
## no_of_style_change no_of_workers actual_productivity
## Min. :0.0000 Min. : 2.00 Min. :0.2337
## 1st Qu.:0.0000 1st Qu.: 9.00 1st Qu.:0.6503
## Median :0.0000 Median :34.00 Median :0.7733
## Mean :0.1504 Mean :34.61 Mean :0.7351
## 3rd Qu.:0.0000 3rd Qu.:57.00 3rd Qu.:0.8503
## Max. :2.0000 Max. :89.00 Max. :1.1204
##
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(ggplot2)
# Draw a sample half the size of the full data set, with replacement.
sample_1 <- garment_prod |> sample_frac(0.5, replace = TRUE)
View(sample_1)
# Mean targeted productivity within the sample.
sample_1 |> pluck("targeted_productivity") |> mean()
## [1] 0.7371237
# wip contains missing values, so a bare max() returns NA; na.rm = TRUE
# reports the largest observed value instead. The `## [1] NA` recorded
# below predates this fix and is refreshed when the document is re-run.
sample_1 |> pluck("wip") |> max(na.rm = TRUE)
## [1] NA
# Mean actual productivity within the sample.
sample_1 |> pluck("actual_productivity") |> mean()
## [1] 0.7341585
I’ve randomly generated several subsamples using seven columns: quarter, day, targeted_productivity, actual_productivity, smv, over_time, and no_of_workers.
# Build a random number (between 5 and 10) of subsamples of garment_prod.
# Each subsample keeps only the selected columns and holds half as many rows
# as the full data, drawn with replacement.
num <- sample(5:10, 1)
columns <- c(
  "quarter", "day", "targeted_productivity", "actual_productivity",
  "smv", "over_time", "no_of_workers"
)
# The subsample size is the same for every iteration, so compute it once.
s_size <- round(0.5 * nrow(garment_prod))
# Preallocate the result instead of growing the list inside the loop.
subsample_list <- vector("list", num)
for (i in seq_len(num)) {
  # seq_len() stays well-defined even if the data frame has no rows.
  s_index <- sample(seq_len(nrow(garment_prod)), size = s_size, replace = TRUE)
  subsample <- garment_prod[s_index, columns]
  subsample_list[[i]] <- subsample
}
The summary statistics for each subsample created above are printed below using a for loop.
# Turn the summary() of every subsample into a captioned knitr table.
summary_table <- lapply(
  subsample_list,
  \(df) knitr::kable(summary(df), caption = "Summary Statistics")
)

# Print each table under a heading that carries its subsample number.
for (i in seq_len(num)) {
  cat("### Subsample", i, "summary statisics \n")
  print(summary_table[[i]])
}
## ### Subsample 1 summary statisics
##
##
## Table: Summary Statistics
##
## | | quarter | day |targeted_productivity |actual_productivity | smv | over_time |no_of_workers |
## |:--|:----------------|:----------------|:---------------------|:-------------------|:-------------|:-------------|:-------------|
## | |Length:598 |Length:598 |Min. :0.0700 |Min. :0.2380 |Min. : 2.90 |Min. : 0 |Min. : 2.00 |
## | |Class :character |Class :character |1st Qu.:0.7000 |1st Qu.:0.6503 |1st Qu.: 3.94 |1st Qu.: 1440 |1st Qu.: 9.00 |
## | |Mode :character |Mode :character |Median :0.7500 |Median :0.7688 |Median :14.89 |Median : 3780 |Median :34.00 |
## | |NA |NA |Mean :0.7285 |Mean :0.7323 |Mean :14.60 |Mean : 4585 |Mean :33.87 |
## | |NA |NA |3rd Qu.:0.8000 |3rd Qu.:0.8501 |3rd Qu.:22.94 |3rd Qu.: 6840 |3rd Qu.:57.00 |
## | |NA |NA |Max. :0.8000 |Max. :1.1204 |Max. :50.48 |Max. :15120 |Max. :89.00 |
## ### Subsample 2 summary statisics
##
##
## Table: Summary Statistics
##
## | | quarter | day |targeted_productivity |actual_productivity | smv | over_time |no_of_workers |
## |:--|:----------------|:----------------|:---------------------|:-------------------|:-------------|:-------------|:-------------|
## | |Length:598 |Length:598 |Min. :0.0700 |Min. :0.2337 |Min. : 2.90 |Min. : 0 |Min. : 2.00 |
## | |Class :character |Class :character |1st Qu.:0.7000 |1st Qu.:0.6360 |1st Qu.: 3.94 |1st Qu.: 1440 |1st Qu.: 8.00 |
## | |Mode :character |Mode :character |Median :0.7500 |Median :0.7544 |Median :11.61 |Median : 3780 |Median :33.00 |
## | |NA |NA |Mean :0.7318 |Mean :0.7272 |Mean :14.59 |Mean : 4417 |Mean :33.32 |
## | |NA |NA |3rd Qu.:0.8000 |3rd Qu.:0.8501 |3rd Qu.:24.26 |3rd Qu.: 6952 |3rd Qu.:57.00 |
## | |NA |NA |Max. :0.8000 |Max. :1.1204 |Max. :51.02 |Max. :14640 |Max. :60.00 |
## ### Subsample 3 summary statisics
##
##
## Table: Summary Statistics
##
## | | quarter | day |targeted_productivity |actual_productivity | smv | over_time |no_of_workers |
## |:--|:----------------|:----------------|:---------------------|:-------------------|:-------------|:-------------|:-------------|
## | |Length:598 |Length:598 |Min. :0.3500 |Min. :0.2380 |Min. : 2.90 |Min. : 0 |Min. : 5.00 |
## | |Class :character |Class :character |1st Qu.:0.7000 |1st Qu.:0.6504 |1st Qu.: 3.94 |1st Qu.: 1440 |1st Qu.: 9.00 |
## | |Mode :character |Mode :character |Median :0.7500 |Median :0.7733 |Median :15.26 |Median : 4080 |Median :34.00 |
## | |NA |NA |Mean :0.7334 |Mean :0.7364 |Mean :14.83 |Mean : 4548 |Mean :34.04 |
## | |NA |NA |3rd Qu.:0.8000 |3rd Qu.:0.8503 |3rd Qu.:23.41 |3rd Qu.: 6900 |3rd Qu.:57.00 |
## | |NA |NA |Max. :0.8000 |Max. :1.0580 |Max. :51.02 |Max. :10770 |Max. :89.00 |
## ### Subsample 4 summary statisics
##
##
## Table: Summary Statistics
##
## | | quarter | day |targeted_productivity |actual_productivity | smv | over_time |no_of_workers |
## |:--|:----------------|:----------------|:---------------------|:-------------------|:-------------|:-------------|:-------------|
## | |Length:598 |Length:598 |Min. :0.0700 |Min. :0.2337 |Min. : 2.90 |Min. : 0 |Min. : 2.00 |
## | |Class :character |Class :character |1st Qu.:0.7000 |1st Qu.:0.6500 |1st Qu.: 4.08 |1st Qu.: 1440 |1st Qu.: 9.00 |
## | |Mode :character |Mode :character |Median :0.7500 |Median :0.7508 |Median :15.26 |Median : 4080 |Median :35.00 |
## | |NA |NA |Mean :0.7331 |Mean :0.7238 |Mean :15.39 |Mean : 4662 |Mean :35.29 |
## | |NA |NA |3rd Qu.:0.8000 |3rd Qu.:0.8501 |3rd Qu.:23.41 |3rd Qu.: 6900 |3rd Qu.:57.00 |
## | |NA |NA |Max. :0.8000 |Max. :1.1204 |Max. :51.02 |Max. :25920 |Max. :60.00 |
## ### Subsample 5 summary statisics
##
##
## Table: Summary Statistics
##
## | | quarter | day |targeted_productivity |actual_productivity | smv | over_time |no_of_workers |
## |:--|:----------------|:----------------|:---------------------|:-------------------|:-------------|:-------------|:-------------|
## | |Length:598 |Length:598 |Min. :0.070 |Min. :0.2462 |Min. : 2.90 |Min. : 0 |Min. : 2.00 |
## | |Class :character |Class :character |1st Qu.:0.700 |1st Qu.:0.6468 |1st Qu.: 4.08 |1st Qu.: 1440 |1st Qu.:10.00 |
## | |Mode :character |Mode :character |Median :0.750 |Median :0.7507 |Median :15.26 |Median : 4080 |Median :35.00 |
## | |NA |NA |Mean :0.725 |Mean :0.7243 |Mean :15.73 |Mean : 4512 |Mean :35.56 |
## | |NA |NA |3rd Qu.:0.800 |3rd Qu.:0.8174 |3rd Qu.:25.75 |3rd Qu.: 6840 |3rd Qu.:57.00 |
## | |NA |NA |Max. :0.800 |Max. :1.0580 |Max. :54.56 |Max. :15120 |Max. :60.00 |
To understand the variation within the dataset, I’ve chosen one column, “no_of_workers”, and examined its distribution in each subsample using histograms.
# Summary statistics for each subsample.
summary_stats <- lapply(subsample_list, summary)

# One histogram of no_of_workers per subsample. The title and x-axis label
# name the plotted variable (the title previously referred to SMV).
histograms <- lapply(subsample_list, function(subsample) {
  ggplot(subsample, aes(x = no_of_workers)) +
    geom_histogram(binwidth = 1, fill = "blue", color = "black") +
    labs(
      title = "Histogram of no_of_workers",
      x = "Number of workers",
      y = "Frequency"
    )
})

# Display each subsample's summary followed by its histogram.
for (i in seq_along(subsample_list)) {
  cat("Subsample", i, "summary statistics:\n")
  print(summary_stats[[i]])
  print(histograms[[i]])
}
## Subsample 1 summary statistics:
## quarter day targeted_productivity
## Length:598 Length:598 Min. :0.0700
## Class :character Class :character 1st Qu.:0.7000
## Mode :character Mode :character Median :0.7500
## Mean :0.7285
## 3rd Qu.:0.8000
## Max. :0.8000
## actual_productivity smv over_time no_of_workers
## Min. :0.2380 Min. : 2.90 Min. : 0 Min. : 2.00
## 1st Qu.:0.6503 1st Qu.: 3.94 1st Qu.: 1440 1st Qu.: 9.00
## Median :0.7688 Median :14.89 Median : 3780 Median :34.00
## Mean :0.7323 Mean :14.60 Mean : 4585 Mean :33.87
## 3rd Qu.:0.8501 3rd Qu.:22.94 3rd Qu.: 6840 3rd Qu.:57.00
## Max. :1.1204 Max. :50.48 Max. :15120 Max. :89.00
## Subsample 2 summary statistics:
## quarter day targeted_productivity
## Length:598 Length:598 Min. :0.0700
## Class :character Class :character 1st Qu.:0.7000
## Mode :character Mode :character Median :0.7500
## Mean :0.7318
## 3rd Qu.:0.8000
## Max. :0.8000
## actual_productivity smv over_time no_of_workers
## Min. :0.2337 Min. : 2.90 Min. : 0 Min. : 2.00
## 1st Qu.:0.6360 1st Qu.: 3.94 1st Qu.: 1440 1st Qu.: 8.00
## Median :0.7544 Median :11.61 Median : 3780 Median :33.00
## Mean :0.7272 Mean :14.59 Mean : 4417 Mean :33.32
## 3rd Qu.:0.8501 3rd Qu.:24.26 3rd Qu.: 6952 3rd Qu.:57.00
## Max. :1.1204 Max. :51.02 Max. :14640 Max. :60.00
## Subsample 3 summary statistics:
## quarter day targeted_productivity
## Length:598 Length:598 Min. :0.3500
## Class :character Class :character 1st Qu.:0.7000
## Mode :character Mode :character Median :0.7500
## Mean :0.7334
## 3rd Qu.:0.8000
## Max. :0.8000
## actual_productivity smv over_time no_of_workers
## Min. :0.2380 Min. : 2.90 Min. : 0 Min. : 5.00
## 1st Qu.:0.6504 1st Qu.: 3.94 1st Qu.: 1440 1st Qu.: 9.00
## Median :0.7733 Median :15.26 Median : 4080 Median :34.00
## Mean :0.7364 Mean :14.83 Mean : 4548 Mean :34.04
## 3rd Qu.:0.8503 3rd Qu.:23.41 3rd Qu.: 6900 3rd Qu.:57.00
## Max. :1.0580 Max. :51.02 Max. :10770 Max. :89.00
## Subsample 4 summary statistics:
## quarter day targeted_productivity
## Length:598 Length:598 Min. :0.0700
## Class :character Class :character 1st Qu.:0.7000
## Mode :character Mode :character Median :0.7500
## Mean :0.7331
## 3rd Qu.:0.8000
## Max. :0.8000
## actual_productivity smv over_time no_of_workers
## Min. :0.2337 Min. : 2.90 Min. : 0 Min. : 2.00
## 1st Qu.:0.6500 1st Qu.: 4.08 1st Qu.: 1440 1st Qu.: 9.00
## Median :0.7508 Median :15.26 Median : 4080 Median :35.00
## Mean :0.7238 Mean :15.39 Mean : 4662 Mean :35.29
## 3rd Qu.:0.8501 3rd Qu.:23.41 3rd Qu.: 6900 3rd Qu.:57.00
## Max. :1.1204 Max. :51.02 Max. :25920 Max. :60.00
## Subsample 5 summary statistics:
## quarter day targeted_productivity
## Length:598 Length:598 Min. :0.070
## Class :character Class :character 1st Qu.:0.700
## Mode :character Mode :character Median :0.750
## Mean :0.725
## 3rd Qu.:0.800
## Max. :0.800
## actual_productivity smv over_time no_of_workers
## Min. :0.2462 Min. : 2.90 Min. : 0 Min. : 2.00
## 1st Qu.:0.6468 1st Qu.: 4.08 1st Qu.: 1440 1st Qu.:10.00
## Median :0.7507 Median :15.26 Median : 4080 Median :35.00
## Mean :0.7243 Mean :15.73 Mean : 4512 Mean :35.56
## 3rd Qu.:0.8174 3rd Qu.:25.75 3rd Qu.: 6840 3rd Qu.:57.00
## Max. :1.0580 Max. :54.56 Max. :15120 Max. :60.00
From the histograms, I observed that:

- The peak frequencies range from about 50 to 150 or 200 across the subsamples.
- The overall variation is not very different from one subsample to another.
- The peaks appear at similar values but differ in their frequency.
To understand anomalies and consistency, I’ve chosen to compute the mean and standard deviation of the “no_of_workers” column for each subsample.
# Mean and standard deviation of no_of_workers for every subsample,
# each kept as a list with one entry per subsample.
means <- lapply(subsample_list, \(df) mean(df[["no_of_workers"]]))
sds <- lapply(subsample_list, \(df) sd(df[["no_of_workers"]]))

# Report both statistics, one line each, for every subsample.
for (i in seq_len(num)) {
  cat("Subsample", i, "Mean of Number of workers:", means[[i]], "\n")
  cat("Subsample", i, "SD of Number of workers:", sds[[i]], "\n")
}
## Subsample 1 Mean of Number of workers: 33.8704
## Subsample 1 SD of Number of workers: 22.33025
## Subsample 2 Mean of Number of workers: 33.31856
## Subsample 2 SD of Number of workers: 22.3672
## Subsample 3 Mean of Number of workers: 34.03512
## Subsample 3 SD of Number of workers: 22.11513
## Subsample 4 Mean of Number of workers: 35.28763
## Subsample 4 SD of Number of workers: 22.1017
## Subsample 5 Mean of Number of workers: 35.5602
## Subsample 5 SD of Number of workers: 21.84956
Looking at the results, the mean of each subsample shows no prominent variation; it ranges from 30 to 40. Similarly, the standard deviation shows no significant change, with values ranging from 21 to 23. This narrow range and minimal deviation suggest that the data in the no_of_workers column is consistent across the subsamples.