# Load necessary libraries
library(ggplot2)
library(ggfortify)  
## Warning: package 'ggfortify' was built under R version 4.3.3
library(dplyr)     
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read the wine data set from CSV and echo it for a quick visual check.
# NOTE(review): the path below is absolute and machine-specific; it has to be
# adjusted before the script can run on another machine.
df <- read.csv(
  file = "E:/Alliance University/Sem 3/ML2/wine.csv"
)
df
##   X   Wine Hedonic Meat Dessert Price Sugar Alcohol Acidity
## 1 1 Wine_1      14    7       8     7     7      13       7
## 2 2 Wine_2      10    7       6     4     3      14       7
## 3 3 Wine_3       8    5       5    10     5      12       5
## 4 4 Wine_4       2    4       7    16     7      11       3
## 5 5 Wine_5       6    2       4    13     3      10       3
# Keep only the measurement columns for PCA.
# Columns 1-2 ('X', a row index, and 'Wine', a text label) are identifiers,
# so columns 3-9 (Hedonic through Acidity) are used as the PCA inputs.
numeric_data <- df[, 3:9]

# Make sure every PCA input is truly numeric.
# Calling as.numeric() directly on a factor returns its internal level codes
# (1, 2, 3, ...) rather than the values shown in the data, so factors are
# converted through their character labels instead. Labels that are not
# numbers become NA (with a warning) rather than arbitrary codes.
numeric_data_cleaned <- numeric_data %>%
  mutate(across(where(is.factor), function(x) as.numeric(as.character(x))))
# Run PCA on the cleaned measurements. Each variable is centred and scaled to
# unit variance first so that variables on larger scales do not dominate.
pca_result <- prcomp(
  x = numeric_data_cleaned,
  center = TRUE,
  scale. = TRUE
)

# Show the standard deviation and (cumulative) proportion of variance per PC
summary(pca_result)
## Importance of components:
##                           PC1    PC2     PC3     PC4      PC5
## Standard deviation     2.1824 1.3454 0.59393 0.27276 5.09e-17
## Proportion of Variance 0.6804 0.2586 0.05039 0.01063 0.00e+00
## Cumulative Proportion  0.6804 0.9390 0.98937 1.00000 1.00e+00
# Biplot of the PCA using ggfortify.
# Points are the observations (labelled) and arrows are the loadings, i.e.
# the contributions of the original variables to the plotted components.
pca_biplot <- autoplot(pca_result, data = df, label = TRUE, label.size = 3,
                       loadings = TRUE, loadings.label = TRUE,
                       loadings.label.size = 3) +
  labs(title = "PCA Biplot")

# Print explicitly (as is done for the scree plot) so the figure is also
# drawn when the script is run via source(), where top-level values are not
# auto-printed.
print(pca_biplot)

# Scree-plot data: share of the total variance captured by each component.
# A component's variance is its squared standard deviation.
component_variance <- pca_result$sdev^2
variance_explained <- component_variance / sum(component_variance)

# One row per component. PC is a factor with levels in component order so the
# x-axis stays PC1, PC2, ..., PC10 instead of being sorted alphabetically
# (which would place "PC10" before "PC2"). seq_along() is used instead of
# 1:length() so an empty input yields no labels rather than c(1, 0).
pc_labels <- paste0("PC", seq_along(variance_explained))
pc_df <- data.frame(
  PC = factor(pc_labels, levels = pc_labels),
  Variance = variance_explained
)

# Scree plot: one dot per principal component, joined by a line, showing the
# proportion of variance each component explains.
scree_plot <- ggplot(data = pc_df, mapping = aes(x = PC, y = Variance)) +
  geom_point() +
  # group = 1 puts all rows in a single group so the line joins every point
  # even though the x-axis is discrete
  geom_line(mapping = aes(group = 1)) +
  ggtitle("Scree Plot") +
  xlab("Principal Component") +
  ylab("Proportion of Variance Explained") +
  theme_minimal() +
  # Slant the component labels so they stay readable
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Render the scree plot
print(scree_plot)