knitr::opts_chunk$set(echo = TRUE)
library(ggcorrplot)
## Loading required package: ggplot2
library(FactoMineR)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
pizza <- read.csv("pizza.csv")
The dataset relates to ingredients in different brands of pizzas. It has 300 observations across 9 variables. 7 out of 9 variables are numeric, while 2 are of chr and int type.
For column ‘id’, it currently has datatype int, however given its nature, chr is more appropriate.
str(pizza)
## 'data.frame': 300 obs. of 9 variables:
## $ brand : chr "A" "A" "A" "A" ...
## $ id : int 14069 14053 14025 14016 14005 14075 14082 14097 14117 14133 ...
## $ mois : num 27.8 28.5 28.4 30.6 30.5 ...
## $ prot : num 21.4 21.3 20 20.1 21.3 ...
## $ fat : num 44.9 43.9 45.8 43.1 41.6 ...
## $ ash : num 5.11 5.34 5.08 4.79 4.82 4.92 4.71 5.28 5.02 5.16 ...
## $ sodium: num 1.77 1.79 1.63 1.61 1.64 1.65 1.58 1.75 1.71 1.66 ...
## $ carb : num 0.77 1.02 0.8 1.38 1.76 1.4 1.77 2.95 1.18 0.64 ...
## $ cal : num 4.93 4.84 4.95 4.74 4.67 4.67 4.63 4.72 4.93 4.95 ...
pizza<- pizza %>%
mutate(id = as.character(id))
We don’t have any missing values in the dataset, which is good news, as the presence of missing values can potentially bias the PCA.
colSums(is.na(pizza))
## brand id mois prot fat ash sodium carb cal
## 0 0 0 0 0 0 0 0 0
Data normalization is required as predictors may have different scales. It may or may not be the case for our pizza case, but it is safer to normalize.
#Select numerical columns only
pizza_num <- pizza %>%
select_if(is.numeric)
#scale the variables
pizza_num_scaled <- scale(pizza_num)
#view the first view rows
head(pizza_num_scaled)
## mois prot fat ash sodium carb cal
## [1,] -1.369526 1.252089 2.745255 1.950635 2.971721 -1.225463 2.675659
## [2,] -1.299391 1.225669 2.636070 2.131776 3.025723 -1.211598 2.530505
## [3,] -1.314046 1.028292 2.846640 1.927007 2.593708 -1.223800 2.707915
## [4,] -1.083752 1.053158 2.551397 1.698611 2.539707 -1.191630 2.369224
## [5,] -1.090033 1.228777 2.386506 1.722238 2.620709 -1.170554 2.256327
## [6,] -1.021991 1.065591 2.460039 1.800996 2.647710 -1.190521 2.256327
pizza_cor <- cor(pizza_num_scaled)
ggcorrplot(pizza_cor)
Insights:
pizza_pca <- prcomp(pizza_num_scaled)
summary(pizza_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.042 1.5134 0.64387 0.3085 0.16636 0.01837 0.003085
## Proportion of Variance 0.596 0.3272 0.05922 0.0136 0.00395 0.00005 0.000000
## Cumulative Proportion 0.596 0.9232 0.98240 0.9960 0.99995 1.00000 1.000000
Insights:
Accessing the rotation attribute gives us the values of our eigenvectors. Below are the eigenvectors for PC1 and PC2
pizza_pca$rotation[, 1:2]
## PC1 PC2
## mois 0.06470937 -0.6282759
## prot 0.37876090 -0.2697067
## fat 0.44666592 0.2343791
## ash 0.47188953 -0.1109904
## sodium 0.43570289 0.2016617
## carb -0.42491371 0.3203121
## cal 0.24448730 0.5674576
Insights:
The calculation below demonstrates how the* X attribute (accessible with pizza_pca$x or also called the PCA scores) for moist in PC1 is calculated.
Note: Eigenvectors are like coefficients in linear equation while moist, protein etc refers to original (scaled) value of each variable > pizza_pca$x[1,1] = 0.06470937 * moist + 0.37876090 * protein + 0.44666592 * fat + …
#The below are the values of 'moist', 'protein', and 'fat' that should go to equation above
pizza_num_scaled[1,]
## mois prot fat ash sodium carb cal
## -1.369526 1.252089 2.745255 1.950635 2.971721 -1.225463 2.675659
The value of pizza_pca$x[1.1] can be calculated using dot multiplication.
sum(pizza_num_scaled[1,] * pizza_pca$rotation[, 1])
## [1] 5.001985
This gives the same result as the resulting X attribute for pizza_pca$x[1,1]
pizza_pca$x[1,1]
## PC1
## 5.001985
Meanwhile, when we have PCA score (X) and eigenvalues, we can also reconstruct the matrices back to the original data to which we originally applied prcomp() to. Recall that our data was scaled, so in order to get the original data:
Let’s try with the first row and the first column of our data, which is moist:
reconstructed_data <- (pizza_pca$x %*% t(pizza_pca$rotation))[1,1]
reconstructed_data
## mois
## -1.369526
Note that this is the same as the value of pizza_num_scaled[1,1], which is the scaled data we fed into prcomp()
pizza_num_scaled[1,1]
## mois
## -1.369526
Next, we unscale the data to give back the same results as pizza_num
# unscale the data
reconstructed_data <- reconstructed_data * sd(pizza_num[,1]) + mean(pizza_num[,1])
reconstructed_data
## mois
## 27.82
Note that the above result is the same as the original data at the start of this report.
sd_moist <- sd(pizza_num[,1])
mean_moist <- mean(pizza_num[,1])
pizza_num_scaled[1,1] * sd_moist + mean_moist
## mois
## 27.82
pizza[1,3]
## [1] 27.82
fviz_pca_var(pizza_pca, col.var = "cos2",
gradient.cols = c("black", "orange", "green"),
repel = TRUE)
Insights:
fviz_contrib(
X = pizza_pca, #
choice = "var",
axes = 1
)
fviz_contrib(
X = pizza_pca, #
choice = "var",
axes = 2
)
pizza_pca_2 <- PCA(X=pizza_cor,
scale.unit = T ,
graph = F,
ncp=7)
Visualization using factomineR is often more informative, as with habillage we can specify what additional information we want summarised into the plot. For example, I want to see the PC plots with color grading with respect to caloric content.
plot.PCA(x = pizza_pca_2,
choix = "ind",
select ='contrib 7', #we have 7 PCs
habillage = "cal")
We can conclude from the plot above that high carb, fat, sodium and ash pizzas are usually pizzas with high calories, while pizzas with high moisture content are usually pizzas with lower calories. In terms of contribution to the PC, both carb and moist are the two largest contributors in our dataset.
We have previously decided that PC1 and PC2 would be sufficient for our purpose, which is clear even without the elbow plot as we can see that variance increase at incrementally lower and lower rate as we add more PCs.
As such we will subset the PCA score to only include column 1 to 2.
pizza_data <- data.frame(pizza_pca$x[, 1:2])
head(pizza_data)
## PC1 PC2
## 1 5.001985 2.674746
## 2 5.015375 2.525076
## 3 4.797424 2.669240
## 4 4.462088 2.281218
## 5 4.464433 2.155551
## 6 4.497286 2.164357
To choose an optimal-K, we will assess the optimal number of clusters using fviz_nclust. Note that code is showing an error during knit process.
#fviz_nbclust(pizza_data, FUNcluster = pam, method = "wss")
knitr::include_graphics("cluster screenshot.png")
It appears that 4 would have been the appropriate number of clusters.
library(stats)
k <- 4
pizza_cluster <- kmeans(pizza_data, centers=k)
Next, we join the kmeans cluster result data to original dataset for evaluation and visualization purposes.
pizza_data$cluster <- pizza_cluster$cluster
head(pizza_data)
## PC1 PC2 cluster
## 1 5.001985 2.674746 4
## 2 5.015375 2.525076 4
## 3 4.797424 2.669240 4
## 4 4.462088 2.281218 4
## 5 4.464433 2.155551 4
## 6 4.497286 2.164357 4
Lets subset for the predicted value for each of the 10 classes.
pred_value <- pizza_data %>%
group_by(cluster) %>%
summarise(frequency = n(),
proportion_of_total = round((frequency / nrow(pizza_data)) * 100, 1))
head(pred_value)
## # A tibble: 4 × 3
## cluster frequency proportion_of_total
## <int> <int> <dbl>
## 1 1 149 49.7
## 2 2 68 22.7
## 3 3 54 18
## 4 4 29 9.7
fviz_cluster(object = pizza_cluster,
data= pizza_data %>% select(-cluster))
pizza_cluster$betweenss / pizza_cluster$tot.withinss
## [1] 5.526671