# Load the necessary libraries
library(missMDA)  # I loaded this library because it provides PCA-based imputation methods for handling missing data.
library(ggplot2)  # I used ggplot2 because it allows me to create detailed and customizable plots for data visualization.

# Step 1: Simulate production line data
# Fixing the seed makes every run draw the same simulated values.
set.seed(123)
n_stations <- 50  # Rows: one per production station.
n_metrics <- 4    # Columns: one per performance metric.

# Draw n_stations * n_metrics independent normal values (mean 100, sd 15) and
# arrange them column by column into a stations-by-metrics matrix. Column
# names are attached at construction time; rows stay unnamed.
production_data <- matrix(
  data = rnorm(n = n_stations * n_metrics, mean = 100, sd = 15),
  nrow = n_stations,
  ncol = n_metrics,
  dimnames = list(NULL, c("Speed", "Accuracy", "Downtime", "OutputQuality"))
)

# Step 2: Introduce missing values
# Keep an untouched copy of the data before masking. Without it the true
# values of the masked cells are lost, and the imputed values can never be
# checked against what was actually there.
true_data <- production_data

n_missing <- 20  # Number of cells to blank out.
# Draw n_missing distinct linear (column-major) cell indices. sample.int(n, k)
# selects the same indices as sample(1:n, k) for a given seed.
missing_indices <- sample.int(length(production_data), size = n_missing)
production_data[missing_indices] <- NA  # Mark the selected cells as missing.

# Step 3: Impute missing values using PCA
# imputePCA() fills the NA cells using a PCA model with ncp components; the
# completed matrix is returned in the $completeObs element.
imputed_data <- imputePCA(production_data, ncp = 1)
completed_data <- imputed_data$completeObs

# Step 4: Compare imputed and original values
# (row, col) coordinates of every masked cell, in column-major order.
missing_coords <- which(is.na(production_data), arr.ind = TRUE)
imputed_values <- completed_data[missing_coords]
# Read the originals from the pre-masking copy. Reading them from
# production_data would return only NA, because those cells were just blanked.
original_values <- true_data[missing_coords]

# One row per masked cell, with the imputed value next to the true value so
# the two can be plotted or compared directly.
comparison_df <- data.frame(
  Station = missing_coords[, 1],
  Metric = colnames(production_data)[missing_coords[, 2]],
  ImputedValue = imputed_values,
  OriginalValue = original_values
)

# Step 5: Visualize the imputed data
# Scatter plot with one point per imputed cell: station index on the x-axis,
# imputed value on the y-axis, colour distinguishing the metric.
ggplot(
  data = comparison_df,
  mapping = aes(x = Station, y = ImputedValue, color = Metric)
) +
  geom_point(size = 3) +
  scale_color_discrete(name = "Metric") +  # Legend title for the colours.
  ggtitle("Imputed Performance Metrics for Production Stations") +
  xlab("Production Station") +
  ylab("Imputed Metric Value") +
  theme_minimal()

# Step 6: Analyze and summarize imputation results
# Print a two-line report: a heading, then the number of imputed cells.
n_imputed <- length(imputed_values)
cat("Summary of Imputation:\n")
## Summary of Imputation:
cat("Number of missing values imputed:", n_imputed, "\n")
## Number of missing values imputed: 20
# Step 7: Perform PCA on the completed dataset
# scale. = TRUE rescales every metric to unit variance before the analysis,
# so no single metric dominates because of its scale.
pca_results <- prcomp(completed_data, scale. = TRUE)

# Visualize PCA results
# prcomp() returns the standard deviation of each component in $sdev;
# squaring gives the variance carried by each component.
component_variance <- pca_results$sdev^2
pca_variance <- data.frame(
  Component = seq_along(component_variance),  # Safe even for length 0.
  Variance = component_variance / sum(component_variance)
)

# Bar chart of the proportion of variance explained by each component.
# geom_col() draws bar heights straight from the y values.
ggplot(pca_variance, aes(x = Component, y = Variance)) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Variance Explained by Principal Components",
    x = "Principal Component",
    y = "Proportion of Variance"
  ) +
  theme_minimal()

# Step 8: Final analysis
# NOTE: the evaluation below is commented out because it failed with:
#   Error in cor(imputed_values, original_values, use = "complete.obs") :
#     no complete element pairs
# cor() raises this error when no position has a non-NA value in both vectors.
# That is the case whenever original_values is read from production_data after
# the missing values were inserted: every selected cell is NA by construction.
# The lengths and alignment of the two vectors are not the problem.
# The comparison only works when original_values holds the true values, taken
# from a copy of the data saved before the NAs were introduced.
# TODO: re-enable once original_values holds the true values.
# correlation_imputed <- cor(imputed_values, original_values, use = "complete.obs")  # Correlation as a measure of imputation accuracy.
# cat("Correlation between original and imputed values (where possible):", correlation_imputed, "\n")  # Display the correlation result.