## Example
# ---
# Perform and visualize PCA in the given mtcars dataset
# ---
# OUR CODE GOES BELOW
#
# Loading our dataset
# ---
#
df <- mtcars
head(df)
str(df)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Selecting the numerical data (excluding the categorical variables vs and am)
# ---
#
df <- mtcars[,c(1:7,10,11)]
head(df)
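# An equivalent, name-based selection (a sketch; it drops the binary
# columns vs and am by name rather than by position):
#
# df <- mtcars[, setdiff(names(mtcars), c("vs", "am"))]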
# We then pass df to prcomp(), setting the center and scale. arguments
# to TRUE, and preview the resulting object with summary()
# ---
#
mtcars.pca <- prcomp(df, center = TRUE, scale. = TRUE)
summary(mtcars.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3782 1.4429 0.71008 0.51481 0.42797 0.35184 0.32413
## Proportion of Variance 0.6284 0.2313 0.05602 0.02945 0.02035 0.01375 0.01167
## Cumulative Proportion 0.6284 0.8598 0.91581 0.94525 0.96560 0.97936 0.99103
## PC8 PC9
## Standard deviation 0.2419 0.14896
## Proportion of Variance 0.0065 0.00247
## Cumulative Proportion 0.9975 1.00000
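# A quick sketch of what scale. = TRUE means: the PCA is then computed from
# the correlation matrix, so the squared standard deviations above should
# equal the eigenvalues of cor(df)
all.equal(eigen(cor(df))$values, mtcars.pca$sdev^2)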
# As a result we obtain 9 principal components,
# each of which explains a percentage of the total variation in the dataset.
# PC1 explains 63% of the total variance, which means that nearly two-thirds
# of the information in the dataset (9 variables) can be encapsulated
# by just that one principal component. PC2 explains 23% of the variance, etc.
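# A minimal sketch reproducing the "Proportion of Variance" row above
# directly from the component variances:
pve <- mtcars.pca$sdev^2 / sum(mtcars.pca$sdev^2)
round(pve, 5)
round(cumsum(pve), 5)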
# Calling str() to have a look at our PCA object
# ---
#
str(mtcars.pca)
## List of 5
## $ sdev : num [1:9] 2.378 1.443 0.71 0.515 0.428 ...
## $ rotation: num [1:9, 1:9] -0.393 0.403 0.397 0.367 -0.312 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:9] "mpg" "cyl" "disp" "hp" ...
## .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:9] 20.09 6.19 230.72 146.69 3.6 ...
## ..- attr(*, "names")= chr [1:9] "mpg" "cyl" "disp" "hp" ...
## $ scale : Named num [1:9] 6.027 1.786 123.939 68.563 0.535 ...
## ..- attr(*, "names")= chr [1:9] "mpg" "cyl" "disp" "hp" ...
## $ x : num [1:32, 1:9] -0.664 -0.637 -2.3 -0.215 1.587 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
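# A quick sketch of how these pieces fit together: the scores in x are the
# centered-and-scaled data multiplied by the rotation (loadings) matrix
head(scale(df) %*% mtcars.pca$rotation)
head(mtcars.pca$x)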
# For a better understanding of the linear transformation we use a biplot;
# the x axis of the biplot represents PC1 and the y axis PC2.
# Note that biplot() draws as a side effect and returns NULL, so we call it
# directly rather than assigning and printing its result
biplot(mtcars.pca)
# In the scree plot below, the bend ("elbow") marks where each additional
# component stops adding much to the cumulative contribution
plot(mtcars.pca, type = "l")
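# A sketch of the same idea: plotting the cumulative proportion of variance
# makes the elbow easier to read off
pve <- mtcars.pca$sdev^2 / sum(mtcars.pca$sdev^2)
plot(cumsum(pve), type = "b", xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained")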
# We will now plot our PCA. This will provide us with some very useful
# insights, e.g. which cars are most similar to each other
# ---
#
# Installing our ggbiplot visualisation package
#
library(devtools)
## Warning: package 'devtools' was built under R version 4.1.3
## Loading required package: usethis
#install_github("vqv/ggbiplot", force = TRUE)
# Then Loading our ggbiplot library
#
#library(ggbiplot)
#ggbiplot(mtcars.pca)
#install.packages("ggplot2", dependencies=TRUE)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
library(AMR)
## Warning: package 'AMR' was built under R version 4.1.3
ggplot_pca(mtcars.pca)
# From the graph we see that the variables hp, cyl and disp contribute to PC1,
# with higher values in those variables moving the samples to the right on the plot.
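# A quick sketch confirming this from the loadings themselves: cyl, disp and
# hp load positively on PC1, while mpg and drat load negatively
round(sort(mtcars.pca$rotation[, "PC1"]), 2)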
# Adding more detail to the plot, we pass the row names of mtcars as labels
#
ggplot_pca(mtcars.pca, labels=rownames(mtcars), obs.scale = 1, var.scale = 1)
# We now see which cars are similar to one another.
# The sports cars Maserati Bora, Ferrari Dino and Ford Pantera L all cluster together at the top
# We can also look at the origin of each of the cars by putting them
# into one of three categories i.e. US, Japanese and European cars.
#
mtcars.country <- c(rep("Japan", 3), rep("US", 4), rep("Europe", 7),
                    rep("US", 3), "Europe", rep("Japan", 3), rep("US", 4),
                    rep("Europe", 3), "US", rep("Europe", 3))
ggplot_pca(mtcars.pca, ellipse = TRUE, labels = rownames(mtcars),
           groups = mtcars.country, obs.scale = 1, var.scale = 1)
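# A quick sanity check that the country vector lines up with the rows of mtcars
stopifnot(length(mtcars.country) == nrow(mtcars))
table(mtcars.country)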
# We see that the US cars form a cluster on the right.
# This cluster is characterized by high values for cyl, disp and wt.
# Japanese cars are characterized by high mpg.
# European cars sit somewhat in the middle and are less tightly clustered than either group.
# We now plot PC3 and PC4
ggplot_pca(mtcars.pca,ellipse=TRUE,choices=c(3,4), labels=rownames(mtcars))
# We find it difficult to derive insights from this plot, mainly because PC3 and PC4
# explain very small percentages of the total variation; it would be surprising
# if they turned out to be very informative, separated the groups, or revealed apparent patterns.
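# A quick sketch confirming how little PC3 and PC4 explain:
summary(mtcars.pca)$importance[2, 3:4]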
# Having performed PCA on this dataset, if we were to build a classification model
# to identify the origin of a car (i.e. European, Japanese or US),
# the variables cyl, disp, wt and mpg would be significant, as seen in our PCA.
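# As a quick sketch, we can back this up by inspecting the loadings of those
# variables on the first two components:
round(mtcars.pca$rotation[c("cyl", "disp", "wt", "mpg"), 1:2], 2)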
## Challenge 1
# ---
# Question: Perform and plot PCA on the given Iris dataset. Reduce the 4-dimensional data into two or three dimensions.
# Provide remarks on your analysis.
# ---
# Dataset url = http://bit.ly/IrisDataset
# ---
# OUR CODE GOES BELOW
#
iris <- read.csv('http://bit.ly/IrisDataset')
head(iris)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ sepal_length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ sepal_width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ petal_length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ petal_width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ species : chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
df1 <- iris[,c(1:4)]
head(df1)
iris.pca <- prcomp(df1, center = TRUE, scale. = TRUE)
summary(iris.pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.7061 0.9598 0.38387 0.14355
## Proportion of Variance 0.7277 0.2303 0.03684 0.00515
## Cumulative Proportion 0.7277 0.9580 0.99485 1.00000
str(iris.pca)
## List of 5
## $ sdev : num [1:4] 1.706 0.96 0.384 0.144
## $ rotation: num [1:4, 1:4] 0.522 -0.263 0.581 0.566 -0.372 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
## .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
## $ center : Named num [1:4] 5.84 3.05 3.76 1.2
## ..- attr(*, "names")= chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
## $ scale : Named num [1:4] 0.828 0.434 1.764 0.763
## ..- attr(*, "names")= chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
## $ x : num [1:150, 1:4] -2.26 -2.08 -2.36 -2.3 -2.38 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
## - attr(*, "class")= chr "prcomp"
# An alternative using ggfortify's autoplot() (not run here; note that in
# this CSV the species column name is lowercase):
#
# library(ggfortify)
# iris.pca.plot <- autoplot(iris.pca,
#                           data = iris,
#                           colour = 'species')
# iris.pca.plot
# Scree plot of the iris PCA
plot(iris.pca, type = "l")
ggplot_pca(iris.pca)
# Adding more detail to the plot, we provide arguments rownames as labels
#
ggplot_pca(iris.pca, labels=rownames(iris), obs.scale = 1, var.scale = 1)
# We now see which flowers are similar to one another: the setosa samples
# separate clearly from versicolor and virginica along PC1, which is driven
# mainly by the petal measurements.
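# To reduce the data to two dimensions, as the question asks, we keep just the
# first two score columns (together they retain about 95.8% of the variance);
# a minimal sketch in base R, coloured by species:
iris.2d <- iris.pca$x[, 1:2]
plot(iris.2d, col = factor(iris$species), pch = 19,
     xlab = "PC1 (72.8%)", ylab = "PC2 (23.0%)")
legend("topright", legend = levels(factor(iris$species)),
       col = 1:3, pch = 19)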
# Challenge 2
# ---
# Question: Perform and plot PCA on the given dataset.
# ---
# Dataset url = http://bit.ly/WisconsinDataset
# ---
# OUR CODE GOES BELOW
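# A minimal sketch of one possible approach, assuming the file at the URL is
# the Wisconsin breast cancer data with an id column and a diagnosis label
# (the column names are an assumption; inspect the data first and adjust):
#
# wisc <- read.csv('http://bit.ly/WisconsinDataset')
# str(wisc)
# wisc.num <- wisc[, sapply(wisc, is.numeric)]  # keep the numeric columns only
# wisc.num$id <- NULL                           # drop the identifier, if present
# wisc.pca <- prcomp(wisc.num, center = TRUE, scale. = TRUE)
# summary(wisc.pca)
# ggplot_pca(wisc.pca, groups = wisc$diagnosis, ellipse = TRUE)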
## Challenge 3
# ---
# Question: Perform and plot PCA on the given housing dataset. Provide remarks on your analysis.
# ---
# Dataset url = http://bit.ly/BostonHousingDataset
# ---
# OUR CODE GOES BELOW
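# A minimal sketch, assuming the file at the URL is the Boston housing data
# with all-numeric columns (verify with str() before running PCA):
#
# boston <- read.csv('http://bit.ly/BostonHousingDataset')
# str(boston)
# boston.num <- boston[, sapply(boston, is.numeric)]
# boston.pca <- prcomp(boston.num, center = TRUE, scale. = TRUE)
# summary(boston.pca)
# plot(boston.pca, type = "l")
# ggplot_pca(boston.pca)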