Principal Component Analysis (PCA) is a fundamental technique for dimensionality reduction. This report applies PCA to a well-known wine dataset containing the chemical analysis of 13 constituents for wines derived from three different cultivars. The objective is to reduce the dimensionality of the data, visualize the underlying structure, and investigate whether the chemical profiles alone can distinguish the wine types.
library(tidyverse)
library(factoextra)
library(ggrepel)
library(mclust)
library(gridExtra)
library(ggplot2)
# Load the wine dataset
wine_data <- read.csv("Wine dataset.csv")
# Rename the first column for clarity
colnames(wine_data)[1] <- "Wine_Class"
# Inspect the structure and first rows
head(wine_data)
## Wine_Class Alcohol Malic.acid Ash Alcalinity.of.ash Magnesium Total.phenols
## 1 1 14.23 1.71 2.43 15.6 127 2.80
## 2 1 13.20 1.78 2.14 11.2 100 2.65
## 3 1 13.16 2.36 2.67 18.6 101 2.80
## 4 1 14.37 1.95 2.50 16.8 113 3.85
## 5 1 13.24 2.59 2.87 21.0 118 2.80
## 6 1 14.20 1.76 2.45 15.2 112 3.27
## Flavanoids Nonflavanoid.phenols Proanthocyanins Color.intensity Hue
## 1 3.06 0.28 2.29 5.64 1.04
## 2 2.76 0.26 1.28 4.38 1.05
## 3 3.24 0.30 2.81 5.68 1.03
## 4 3.49 0.24 2.18 7.80 0.86
## 5 2.69 0.39 1.82 4.32 1.04
## 6 3.39 0.34 1.97 6.75 1.05
## OD280.OD315.of.diluted.wines Proline
## 1 3.92 1065
## 2 3.40 1050
## 3 3.17 1185
## 4 3.45 1480
## 5 2.93 735
## 6 2.85 1450
str(wine_data)
## 'data.frame': 178 obs. of 14 variables:
## $ Wine_Class : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Alcohol : num 14.2 13.2 13.2 14.4 13.2 ...
## $ Malic.acid : num 1.71 1.78 2.36 1.95 2.59 1.76 1.87 2.15 1.64 1.35 ...
## $ Ash : num 2.43 2.14 2.67 2.5 2.87 2.45 2.45 2.61 2.17 2.27 ...
## $ Alcalinity.of.ash : num 15.6 11.2 18.6 16.8 21 15.2 14.6 17.6 14 16 ...
## $ Magnesium : int 127 100 101 113 118 112 96 121 97 98 ...
## $ Total.phenols : num 2.8 2.65 2.8 3.85 2.8 3.27 2.5 2.6 2.8 2.98 ...
## $ Flavanoids : num 3.06 2.76 3.24 3.49 2.69 3.39 2.52 2.51 2.98 3.15 ...
## $ Nonflavanoid.phenols : num 0.28 0.26 0.3 0.24 0.39 0.34 0.3 0.31 0.29 0.22 ...
## $ Proanthocyanins : num 2.29 1.28 2.81 2.18 1.82 1.97 1.98 1.25 1.98 1.85 ...
## $ Color.intensity : num 5.64 4.38 5.68 7.8 4.32 6.75 5.25 5.05 5.2 7.22 ...
## $ Hue : num 1.04 1.05 1.03 0.86 1.04 1.05 1.02 1.06 1.08 1.01 ...
## $ OD280.OD315.of.diluted.wines: num 3.92 3.4 3.17 3.45 2.93 2.85 3.58 3.58 2.85 3.55 ...
## $ Proline : int 1065 1050 1185 1480 735 1450 1290 1295 1045 1045 ...
# Check for missing values in the dataset
sum(is.na(wine_data))
## [1] 0
# Store the target variable (wine class) separately
wine_class <- wine_data$Wine_Class
# Extract only the chemical features (all columns except the first one)
wine_features <- wine_data[, -1]
# Verify the dimensions
dim(wine_features)
## [1] 178 13
# Standardize the features using scale() function
wine_scaled <- scale(wine_features)
# Convert to data frame for better handling
wine_scaled <- as.data.frame(wine_scaled)
# Check the mean and standard deviation of the scaled data
colMeans(wine_scaled)
## Alcohol Malic.acid
## 7.823954e-15 2.943962e-16
## Ash Alcalinity.of.ash
## -4.082876e-15 -5.488743e-17
## Magnesium Total.phenols
## -7.609394e-17 1.234967e-16
## Flavanoids Nonflavanoid.phenols
## 9.580351e-16 -1.635396e-15
## Proanthocyanins Color.intensity
## -1.666270e-15 -4.216353e-16
## Hue OD280.OD315.of.diluted.wines
## 1.657850e-15 2.200487e-15
## Proline
## -1.341000e-16
apply(wine_scaled, 2, sd)
## Alcohol Malic.acid
## 1 1
## Ash Alcalinity.of.ash
## 1 1
## Magnesium Total.phenols
## 1 1
## Flavanoids Nonflavanoid.phenols
## 1 1
## Proanthocyanins Color.intensity
## 1 1
## Hue OD280.OD315.of.diluted.wines
## 1 1
## Proline
## 1
# Perform PCA on the standardized data
pca_result <- prcomp(wine_scaled, center = TRUE, scale. = FALSE)
# Summary of PCA results
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.169 1.5802 1.2025 0.95863 0.92370 0.80103 0.74231
## Proportion of Variance 0.362 0.1921 0.1112 0.07069 0.06563 0.04936 0.04239
## Cumulative Proportion 0.362 0.5541 0.6653 0.73599 0.80162 0.85098 0.89337
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.59034 0.53748 0.5009 0.47517 0.41082 0.32152
## Proportion of Variance 0.02681 0.02222 0.0193 0.01737 0.01298 0.00795
## Cumulative Proportion 0.92018 0.94240 0.9617 0.97907 0.99205 1.00000
# Extract eigenvalues (variances of principal components)
eigenvalues <- pca_result$sdev^2
# Create a data frame for eigenvalues
eigenvalues_df <- data.frame(
PC = paste0("PC", 1:length(eigenvalues)),
Eigenvalue = eigenvalues,
Variance_Explained = eigenvalues / sum(eigenvalues) * 100,
Cumulative_Variance = cumsum(eigenvalues / sum(eigenvalues) * 100)
)
# Display the eigenvalue table
print(eigenvalues_df)
## PC Eigenvalue Variance_Explained Cumulative_Variance
## 1 PC1 4.7058503 36.1988481 36.19885
## 2 PC2 2.4969737 19.2074903 55.40634
## 3 PC3 1.4460720 11.1236305 66.52997
## 4 PC4 0.9189739 7.0690302 73.59900
## 5 PC5 0.8532282 6.5632937 80.16229
## 6 PC6 0.6416570 4.9358233 85.09812
## 7 PC7 0.5510283 4.2386793 89.33680
## 8 PC8 0.3484974 2.6807489 92.01754
## 9 PC9 0.2888799 2.2221534 94.23970
## 10 PC10 0.2509025 1.9300191 96.16972
## 11 PC11 0.2257886 1.7368357 97.90655
## 12 PC12 0.1687702 1.2982326 99.20479
## 13 PC13 0.1033779 0.7952149 100.00000
fviz_eig(pca_result,
addlabels = TRUE,
ylim = c(0, 50),
main = "Scree Plot: Variance Explained by Principal Components",
xlab = "Principal Components",
ylab = "Percentage of Explained Variance",
barfill = "#2E9FDF",
barcolor = "#2E9FDF",
linecolor = "#FC4E07") +
theme_minimal()
## Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
## Ignoring empty aesthetic: `width`.
# Create a biplot showing both samples and variables
fviz_pca_biplot(pca_result,
col.ind = as.factor(wine_class),
palette = c("#2E9FDF", "#FC4E07", "#E7B800"),
col.var = "black",
addEllipses = TRUE,
ellipse.level = 0.95,
ellipse.type = "confidence",
legend.title = "Wine Class",
repel = TRUE,
title = "PCA Biplot: Wine Chemical Analysis",
xlab = paste0("PC1 (", round(eigenvalues_df$Variance_Explained[1], 1), "%)"),
ylab = paste0("PC2 (", round(eigenvalues_df$Variance_Explained[2], 1), "%)"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Ignoring unknown labels:
## • linetype : "Wine Class"
# Plot only the individual wines (samples)
fviz_pca_ind(pca_result,
col.ind = as.factor(wine_class),
palette = c("#2E9FDF", "#FC4E07", "#E7B800"),
addEllipses = TRUE,
ellipse.type = "confidence",
legend.title = "Wine Class",
repel = TRUE,
title = "PCA: Wine Samples Projection",
xlab = paste0("PC1 (", round(eigenvalues_df$Variance_Explained[1], 1), "%)"),
ylab = paste0("PC2 (", round(eigenvalues_df$Variance_Explained[2], 1), "%)"))
## Ignoring unknown labels:
## • linetype : "Wine Class"
# Visualize the contributions of variables to the principal components
fviz_pca_var(pca_result,
col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE,
title = "Variable Contributions to PC1 and PC2",
xlab = paste0("PC1 (", round(eigenvalues_df$Variance_Explained[1], 1), "%)"),
ylab = paste0("PC2 (", round(eigenvalues_df$Variance_Explained[2], 1), "%)"))
# Extract loadings (rotation matrix)
loadings <- pca_result$rotation
# Create a data frame for loadings of PC1 and PC2
loadings_df <- data.frame(
Variable = rownames(loadings),
PC1_Loading = loadings[, 1],
PC2_Loading = loadings[, 2],
PC1_Contribution = abs(loadings[, 1]) * 100,
PC2_Contribution = abs(loadings[, 2]) * 100
)
# Sort by absolute contribution to PC1
loadings_df_sorted <- loadings_df[order(-abs(loadings_df$PC1_Loading)), ]
# Display the top variables contributing to PC1
head(loadings_df_sorted, 10)
## Variable PC1_Loading
## Flavanoids Flavanoids -0.4229343
## Total.phenols Total.phenols -0.3946608
## OD280.OD315.of.diluted.wines OD280.OD315.of.diluted.wines -0.3761674
## Proanthocyanins Proanthocyanins -0.3134295
## Nonflavanoid.phenols Nonflavanoid.phenols 0.2985331
## Hue Hue -0.2967146
## Proline Proline -0.2867522
## Malic.acid Malic.acid 0.2451876
## Alcalinity.of.ash Alcalinity.of.ash 0.2393204
## Alcohol Alcohol -0.1443294
## PC2_Loading PC1_Contribution PC2_Contribution
## Flavanoids 0.003359812 42.29343 0.3359812
## Total.phenols -0.065039512 39.46608 6.5039512
## OD280.OD315.of.diluted.wines 0.164496193 37.61674 16.4496193
## Proanthocyanins -0.039301722 31.34295 3.9301722
## Nonflavanoid.phenols -0.028779488 29.85331 2.8779488
## Hue 0.279235148 29.67146 27.9235148
## Proline -0.364902832 28.67522 36.4902832
## Malic.acid -0.224930935 24.51876 22.4930935
## Alcalinity.of.ash 0.010590502 23.93204 1.0590502
## Alcohol -0.483651548 14.43294 48.3651548
# Sort by absolute contribution to PC2
loadings_df_sorted_pc2 <- loadings_df[order(-abs(loadings_df$PC2_Loading)), ]
# Display the top variables contributing to PC2
head(loadings_df_sorted_pc2, 10)
## Variable PC1_Loading
## Color.intensity Color.intensity 0.088616705
## Alcohol Alcohol -0.144329395
## Proline Proline -0.286752227
## Ash Ash 0.002051061
## Magnesium Magnesium -0.141992042
## Hue Hue -0.296714564
## Malic.acid Malic.acid 0.245187580
## OD280.OD315.of.diluted.wines OD280.OD315.of.diluted.wines -0.376167411
## Total.phenols Total.phenols -0.394660845
## Proanthocyanins Proanthocyanins -0.313429488
## PC2_Loading PC1_Contribution PC2_Contribution
## Color.intensity -0.52999567 8.8616705 52.999567
## Alcohol -0.48365155 14.4329395 48.365155
## Proline -0.36490283 28.6752227 36.490283
## Ash -0.31606881 0.2051061 31.606881
## Magnesium -0.29963400 14.1992042 29.963400
## Hue 0.27923515 29.6714564 27.923515
## Malic.acid -0.22493093 24.5187580 22.493093
## OD280.OD315.of.diluted.wines 0.16449619 37.6167411 16.449619
## Total.phenols -0.06503951 39.4660845 6.503951
## Proanthocyanins -0.03930172 31.3429488 3.930172
# Create a heatmap of loadings for the first 4 principal components
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggplot2)
# Extract first 4 PCs
loadings_first4 <- loadings[, 1:4]
# Prepare data for heatmap
loadings_melted <- melt(loadings_first4)
colnames(loadings_melted) <- c("Variable", "PC", "Loading")
# Create heatmap
ggplot(loadings_melted, aes(x = PC, y = Variable, fill = Loading)) +
geom_tile(color = "white", linewidth = 0.5) +
scale_fill_gradient2(low = "#2E9FDF", mid = "white",
high = "#FC4E07", midpoint = 0) +
geom_text(aes(label = round(Loading, 2)), color = "black", size = 3) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1)) +
labs(title = "PCA Loadings Heatmap (First 4 Principal Components)",
x = "Principal Component",
y = "Chemical Variable",
fill = "Loading Value")
# Based on loadings, interpret what each PC represents
cat("## Interpretation of Principal Components:\n\n")
## ## Interpretation of Principal Components:
cat("### PC1 (", round(eigenvalues_df$Variance_Explained[1], 1), "% of variance):\n")
## ### PC1 ( 36.2 % of variance):
cat("This component is strongly influenced by:\n")
## This component is strongly influenced by:
# Find variables with high absolute loadings (>0.3) for PC1
high_loadings_pc1 <- loadings_df[abs(loadings_df$PC1_Loading) > 0.3, ]
for(i in 1:nrow(high_loadings_pc1)) {
cat("- ", high_loadings_pc1$Variable[i],
" (Loading: ", round(high_loadings_pc1$PC1_Loading[i], 3), ")\n", sep = "")
}
## - Total.phenols (Loading: -0.395)
## - Flavanoids (Loading: -0.423)
## - Proanthocyanins (Loading: -0.313)
## - OD280.OD315.of.diluted.wines (Loading: -0.376)
cat("\n### PC2 (", round(eigenvalues_df$Variance_Explained[2], 1), "% of variance):\n")
##
## ### PC2 ( 19.2 % of variance):
cat("This component is strongly influenced by:\n")
## This component is strongly influenced by:
# Find variables with high absolute loadings (>0.3) for PC2
high_loadings_pc2 <- loadings_df[abs(loadings_df$PC2_Loading) > 0.3, ]
for(i in 1:nrow(high_loadings_pc2)) {
cat("- ", high_loadings_pc2$Variable[i],
" (Loading: ", round(high_loadings_pc2$PC2_Loading[i], 3), ")\n", sep = "")
}
## - Alcohol (Loading: -0.484)
## - Ash (Loading: -0.316)
## - Color.intensity (Loading: -0.53)
## - Proline (Loading: -0.365)
# Extract first 2 principal components for clustering
pca_scores <- pca_result$x[, 1:2]
# Determine optimal number of clusters using elbow method
wss <- numeric(10)
for (i in 1:10) {
kmeans_result_temp <- kmeans(pca_scores, centers = i, nstart = 25)
wss[i] <- kmeans_result_temp$tot.withinss
}
# Plot elbow method
elbow_data <- data.frame(Clusters = 1:10, WSS = wss)
ggplot(elbow_data, aes(x = Clusters, y = WSS)) +
geom_line(color = "#2E9FDF", linewidth = 1.2) +
geom_point(color = "#FC4E07", size = 3) +
geom_vline(xintercept = 3, linetype = "dashed", color = "gray") +
theme_minimal() +
labs(title = "Elbow Method for Optimal Number of Clusters",
x = "Number of Clusters",
y = "Within-Cluster Sum of Squares (WSS)") +
scale_x_continuous(breaks = 1:10)
# Perform k-means clustering with 3 clusters
set.seed(123)
kmeans_result <- kmeans(pca_scores, centers = 3, nstart = 25)
# Add cluster assignments to data
cluster_data <- data.frame(
PC1 = pca_scores[, 1],
PC2 = pca_scores[, 2],
KMeans_Cluster = as.factor(kmeans_result$cluster),
True_Class = as.factor(wine_class)
)
# View cluster assignments vs true classes
cat("### Confusion Matrix: K-means Clusters vs True Classes\n")
## ### Confusion Matrix: K-means Clusters vs True Classes
confusion_matrix <- table(KMeans_Cluster = kmeans_result$cluster,
True_Class = wine_class)
print(confusion_matrix)
## True_Class
## KMeans_Cluster 1 2 3
## 1 59 5 0
## 2 0 65 0
## 3 0 1 48
# Create comparison plots
# Plot 1: True Wine Classes
plot_true <- ggplot(cluster_data, aes(x = PC1, y = PC2, color = True_Class)) +
geom_point(size = 3, alpha = 0.8) +
stat_ellipse(level = 0.95, linewidth = 1) +
scale_color_manual(values = c("#2E9FDF", "#FC4E07", "#E7B800")) +
theme_minimal() +
labs(title = "True Wine Classes",
x = paste0("PC1 (", round(eigenvalues_df$Variance_Explained[1], 1), "%)"),
y = paste0("PC2 (", round(eigenvalues_df$Variance_Explained[2], 1), "%)"),
color = "Wine Class")
# Plot 2: K-means Clusters
plot_clusters <- ggplot(cluster_data, aes(x = PC1, y = PC2, color = KMeans_Cluster)) +
geom_point(size = 3, alpha = 0.8) +
stat_ellipse(level = 0.95, linewidth = 1) +
scale_color_manual(values = c("#2E9FDF", "#FC4E07", "#E7B800")) +
theme_minimal() +
labs(title = "K-means Clustering Results (k=3)",
x = paste0("PC1 (", round(eigenvalues_df$Variance_Explained[1], 1), "%)"),
y = paste0("PC2 (", round(eigenvalues_df$Variance_Explained[2], 1), "%)"),
color = "Cluster")
# Arrange plots side by side
grid.arrange(plot_true, plot_clusters, ncol = 2)
# Calculate performance metrics
ari <- adjustedRandIndex(as.numeric(cluster_data$True_Class),
as.numeric(cluster_data$KMeans_Cluster))
purity <- sum(apply(confusion_matrix, 2, max)) / nrow(cluster_data)
cat("\n### Clustering Performance Metrics:\n")
##
## ### Clustering Performance Metrics:
cat("Adjusted Rand Index (ARI):", round(ari, 4), "\n")
## Adjusted Rand Index (ARI): 0.8951
cat("Purity:", round(purity, 4), "(", round(purity * 100, 1), "% accuracy)\n")
## Purity: 0.9663 ( 96.6 % accuracy)
PCA successfully reduced the data from 13 dimensions to 2 principal components while retaining 55.4% of the total variance. The first two principal components (PC1 and PC2) explain 55.4% of the variance, providing an effective low-dimensional representation.
The PCA visualization clearly separates the three wine cultivars in the reduced 2D space. Class 1 wines are well-separated from Classes 2 and 3, while some overlap exists between Class 2 and Class 3 wines, suggesting similar chemical profiles.
K-means clustering applied to the PCA-reduced data achieved an Adjusted Rand Index of 0.8951 and purity of 0.9663. This demonstrates that PCA not only visualizes the data structure but also preserves clustering-relevant information.
AI tools were used to assist with debugging R code errors, refining text expression, and suggesting improvements for data visualizations.