# Load the necessary libraries
library(missMDA) # I loaded this library because it provides PCA-based imputation methods for handling missing data.
library(ggplot2) # I used ggplot2 because it allows me to create detailed and customizable plots for data visualization.
# Step 1: Simulate production line data
set.seed(123) # Fixed seed so every run draws the same simulated values.
n_stations <- 50 # Rows of the matrix: one per production station.
n_metrics <- 4 # Columns of the matrix: one per performance metric.
# Fill an n_stations x n_metrics matrix with draws from a normal distribution
# (mean 100, sd 15) and label the metric columns at construction time.
production_data <- matrix(
  data = rnorm(n = n_stations * n_metrics, mean = 100, sd = 15),
  nrow = n_stations,
  ncol = n_metrics,
  dimnames = list(NULL, c("Speed", "Accuracy", "Downtime", "OutputQuality"))
)
# Step 2: Introduce missing values
# Keep an untouched copy of the simulated data so the imputed values can later
# be checked against the values that were actually removed.
true_data <- production_data
n_missing <- 20 # Number of cells to blank out.
# Sample linear cell positions of the matrix without replacement.
missing_indices <- sample(seq_len(n_stations * n_metrics), size = n_missing)
production_data[missing_indices] <- NA # Mark the sampled cells as missing.
# Step 3: Impute missing values using PCA
# Impute the masked cells with a PCA model that uses 1 principal component.
imputed_data <- imputePCA(production_data, ncp = 1)
completed_data <- imputed_data$completeObs # Completed dataset after imputation.
# Step 4: Compare imputed and original values
# Row/column coordinates of every masked cell.
missing_coords <- which(is.na(production_data), arr.ind = TRUE)
imputed_values <- completed_data[missing_coords] # Imputed value for each masked cell.
# Read the originals from the pre-masking copy. Reading them from
# production_data would return only NA, which makes any comparison such as
# cor(..., use = "complete.obs") fail with "no complete element pairs".
original_values <- true_data[missing_coords]
# One row per masked cell: where it was, what was imputed, and what the
# simulated value had been before masking.
comparison_df <- data.frame(
  Station = missing_coords[, 1],
  Metric = colnames(production_data)[missing_coords[, 2]],
  ImputedValue = imputed_values,
  OriginalValue = original_values
)
# Step 5: Visualize the imputed data
# Scatter plot of the imputed cells: station on the x-axis, imputed value on
# the y-axis, one colour per metric.
imputed_plot <- ggplot(
  data = comparison_df,
  mapping = aes(x = Station, y = ImputedValue, colour = Metric)
) +
  geom_point(size = 3) + # One point per imputed cell.
  scale_color_discrete(name = "Metric") + # Legend title for the colour key.
  theme_minimal() +
  labs(
    title = "Imputed Performance Metrics for Production Stations",
    x = "Production Station",
    y = "Imputed Metric Value"
  )
# Evaluate the plot object at top level so it is displayed exactly as the
# bare ggplot expression would be.
imputed_plot

# Step 6: Analyze and summarize imputation results
# Header line of the summary.
cat("Summary of Imputation:\n")
## Summary of Imputation:
# Count of cells that were filled in by the imputation.
n_imputed <- length(imputed_values)
cat("Number of missing values imputed:", n_imputed, "\n")
## Number of missing values imputed: 20
# Step 7: Perform PCA on the completed dataset
# PCA on the imputed matrix, with each column scaled to unit variance first.
pca_results <- prcomp(completed_data, scale. = TRUE)
# Visualize PCA results
# Variance of each component is its squared standard deviation; dividing by
# the total gives the proportion of variance explained.
component_variance <- pca_results$sdev^2
pca_variance <- data.frame(
  Component = seq_along(component_variance),
  Variance = component_variance / sum(component_variance)
)
# Bar chart with one bar per principal component, bar height = proportion.
variance_plot <- ggplot(pca_variance, aes(x = Component, y = Variance)) +
  geom_col(fill = "skyblue") + # Bars drawn directly from the Variance column.
  theme_minimal() +
  labs(
    title = "Variance Explained by Principal Components",
    x = "Principal Component",
    y = "Proportion of Variance"
  )
# Evaluate the plot object at top level so it is displayed exactly as the
# bare ggplot expression would be.
variance_plot

# Step 8: Final analysis
# Error in cor(imputed_values, original_values, use = "complete.obs") :
#   no complete element pairs
# This error occurs when original_values is read from production_data after the
# NAs have been inserted: every entry is then NA, so cor() cannot find a single
# pair in which both values are present. The two vectors have the same length
# and alignment; the true values are simply no longer available at that point.
# To evaluate imputation accuracy, keep a copy of the data from before the
# missing values are introduced (Step 2) and take original_values from that
# copy; the correlation below can then be computed. I will revisit this to fix
# the code.
# correlation_imputed <- cor(imputed_values, original_values, use = "complete.obs") # I calculated the correlation to evaluate imputation accuracy.
# cat("Correlation between original and imputed values (where possible):", correlation_imputed, "\n") # I displayed the correlation result.