## Example
# ---
# Perform and visualize PCA in the given mtcars dataset
# ---
# OUR CODE GOES BELOW
#
# Loading our dataset
# ---
#
df <- mtcars
head(df)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Selecting the numerical data (excluding the categorical variables vs and am)
# ---
#
df <- mtcars[,c(1:7,10,11)]
head(df)
## mpg cyl disp hp drat wt qsec gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 3 1
# We then pass df to the prcomp(). We also set two arguments, center and scale,
# to be TRUE then preview our object with summary
# ---
#
mtcars.pca <- prcomp(mtcars[,c(1:7,10,11)], center = TRUE, scale. = TRUE)
summary(mtcars.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3782 1.4429 0.71008 0.51481 0.42797 0.35184 0.32413
## Proportion of Variance 0.6284 0.2313 0.05602 0.02945 0.02035 0.01375 0.01167
## Cumulative Proportion 0.6284 0.8598 0.91581 0.94525 0.96560 0.97936 0.99103
## PC8 PC9
## Standard deviation 0.2419 0.14896
## Proportion of Variance 0.0065 0.00247
## Cumulative Proportion 0.9975 1.00000
# As a result we obtain 9 principal components,
# each which explain a percentate of the total variation of the dataset
# PC1 explains 63% of the total variance, which means that nearly two-thirds
# of the information in the dataset (9 variables) can be encapsulated
# by just that one Principal Component. PC2 explains 23% of the variance. etc
# Calling str() to have a look at your PCA object
# ---
#
str(mtcars.pca)
## List of 5
## $ sdev : num [1:9] 2.378 1.443 0.71 0.515 0.428 ...
## $ rotation: num [1:9, 1:9] -0.393 0.403 0.397 0.367 -0.312 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:9] "mpg" "cyl" "disp" "hp" ...
## .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:9] 20.09 6.19 230.72 146.69 3.6 ...
## ..- attr(*, "names")= chr [1:9] "mpg" "cyl" "disp" "hp" ...
## $ scale : Named num [1:9] 6.027 1.786 123.939 68.563 0.535 ...
## ..- attr(*, "names")= chr [1:9] "mpg" "cyl" "disp" "hp" ...
## $ x : num [1:32, 1:9] -0.664 -0.637 -2.3 -0.215 1.587 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
# Here we note that our pca object: The center point ($center), scaling ($scale),
# standard deviation(sdev) of each principal component.
# The relationship (correlation or anticorrelation, etc)
# between the initial variables and the principal components ($rotation).
# The values of each sample in terms of the principal components ($x)
# We will now plot our pca. This will provide us with some very useful insights i.e.
# which cars are most similar to each other
# ---
#
# Installing our ggbiplot visualisation package
#
library(devtools)
## Loading required package: usethis
install_github("vqv/ggbiplot")
## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
## Use `force = TRUE` to force installation
# Then Loading our ggbiplot library
#
library(ggbiplot)
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
ggbiplot(mtcars.pca)
# From the graph we will see that the variables hp, cyl and disp contribute to PC1,
# with higher values in those variables moving the samples to the right on the plot.
# Adding more detail to the plot, we provide arguments rownames as labels
#
ggbiplot(mtcars.pca, labels=rownames(mtcars), obs.scale = 1, var.scale = 1)
# We now see which cars are similar to one another.
# The sports cars Maserati Bora, Ferrari Dino and Ford Pantera L all cluster together at the top
# We can also look at the origin of each of the cars by putting them
# into one of three categories i.e. US, Japanese and European cars.
#
mtcars.country <- c(rep("Japan", 3), rep("US",4), rep("Europe", 7),rep("US",3), "Europe", rep("Japan", 3), rep("US",4), rep("Europe", 3), "US", rep("Europe", 3))
ggbiplot(mtcars.pca,ellipse=TRUE, labels=rownames(mtcars), groups=mtcars.country, obs.scale = 1, var.scale = 1)
# We get to see that US cars for a cluster on the right.
# This cluster is characterized by high values for cyl, disp and wt.
# Japanese cars are characterized by high mpg.
# European cars are somewhat in the middle and less tightly clustered that either group.
# We now plot PC3 and PC4
ggbiplot(mtcars.pca,ellipse=TRUE,choices=c(3,4), labels=rownames(mtcars), groups=mtcars.country)
# We find it difficult to derive insights from the given plot mainly because PC3 and PC4
# explain very small percentages of the total variation, thus it would be surprising
# if we found that they were very informative and separated the groups or revealed apparent patterns.
# Having performed PCA using this dataset, if we were to build a classification model
# to identify the origin of a car (i.e. European, Japanese, US),
# the variables cyl, disp, wt and mpg would be significant variables as seen in our PCA analysis.
## Challenge 1
# ---
# Question: Perform and plot PCA to the give Iris dataset. Reduce 4 dimensinal data into 2 or three dimensions.
# Provide remarks on your analysis.
# ---
# Dataset url = http://bit.ly/IrisDataset
# ---
# OUR CODE GOES BELOW
#
# Loading the dataset
iris <- read.csv('http://bit.ly/IrisDataset')
#preview first 6 rows of the dataset
head(iris)
## sepal_length sepal_width petal_length petal_width species
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
# Perform PCA
iris.pca <- prcomp(iris[,c(1:4)], scale.=TRUE, center = TRUE)
summary(iris.pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.7061 0.9598 0.38387 0.14355
## Proportion of Variance 0.7277 0.2303 0.03684 0.00515
## Cumulative Proportion 0.7277 0.9580 0.99485 1.00000
str(iris.pca)
## List of 5
## $ sdev : num [1:4] 1.706 0.96 0.384 0.144
## $ rotation: num [1:4, 1:4] 0.522 -0.263 0.581 0.566 -0.372 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
## .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
## $ center : Named num [1:4] 5.84 3.05 3.76 1.2
## ..- attr(*, "names")= chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
## $ scale : Named num [1:4] 0.828 0.434 1.764 0.763
## ..- attr(*, "names")= chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
## $ x : num [1:150, 1:4] -2.26 -2.08 -2.36 -2.3 -2.38 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
## - attr(*, "class")= chr "prcomp"
# plot results
ggbiplot(iris.pca, ellipse = TRUE, obs.scale = 1, var.scale = 1, groups=c(iris$species))
## Challenge 2
# ---
# Question: Perform and plot PCA on the given dataset.
# ---
# Dataset url = http://bit.ly/WisconsinDataset
# ---
# OUR CODE GOES BELOW
#
# Load the dataset
wiscon <- read.delim('C:/Users/HP 840 G3 i7/Downloads/breast-cancer-wisconsin.data', header=FALSE, sep=',')
# Preview the dataset
head(wiscon)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
## 1 1000025 5 1 1 1 2 1 3 1 1 2
## 2 1002945 5 4 4 5 7 10 3 2 1 2
## 3 1015425 3 1 1 1 2 2 3 1 1 2
## 4 1016277 6 8 8 1 3 4 3 7 1 2
## 5 1017023 4 1 1 3 2 1 3 1 1 2
## 6 1017122 8 10 10 8 7 10 9 7 1 4
colnames(wiscon[1])
## [1] "V1"
# perform pca
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
wiscon.pca <- prcomp(select(wiscon,-V7), center=TRUE, scale=TRUE)
summary(wiscon.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4661 0.99859 0.87318 0.73968 0.64081 0.61296 0.54631
## Proportion of Variance 0.6081 0.09972 0.07624 0.05471 0.04106 0.03757 0.02985
## Cumulative Proportion 0.6081 0.70786 0.78411 0.83882 0.87988 0.91746 0.94730
## PC8 PC9 PC10
## Standard deviation 0.51202 0.41785 0.30037
## Proportion of Variance 0.02622 0.01746 0.00902
## Cumulative Proportion 0.97352 0.99098 1.00000
# plot
ggbiplot(wiscon.pca, ellipse=TRUE, groups=c(wiscon$V7), obs.scale = 1 , var.scale = 20)
## Challenge 3
# ---
# Question: Perform and plot pca the given housing dataset. Provide remarks to your analysis.
# ---
# Dataset url = http://bit.ly/BostonHousingDataset
# ---
# OUR CODE GOES BELOW
#
# Load the dataset
boston <- read.csv('http://bit.ly/BostonHousingDataset')
# preview the dataset
head(boston)
## crim zn indus chas nox rm age dis rad tax ptratio b lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
crim per capita crime rate by town.
zn proportion of residential land zoned for lots over 25,000 sq.ft.
indus proportion of non-retail business acres per town.
chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
nox nitrogen oxides concentration (parts per 10 million).
rm average number of rooms per dwelling.
age proportion of owner-occupied units built prior to 1940.
dis weighted mean of distances to five Boston employment centres.
rad index of accessibility to radial highways.
tax full-value property-tax rate per $10,000.
ptratio pupil-teacher ratio by town.
black 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
lstat lower status of the population (percent).
medv median value of owner-occupied homes in $1000s.
# perform pca
boston.pca <- prcomp(select(boston, -chas), center = TRUE, scale. = TRUE)
# Print summary
summary(boston.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.5585 1.2340 1.1558 0.92952 0.81655 0.73311 0.63533
## Proportion of Variance 0.5035 0.1171 0.1027 0.06646 0.05129 0.04134 0.03105
## Cumulative Proportion 0.5035 0.6207 0.7234 0.78987 0.84116 0.88250 0.91355
## PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 0.52679 0.5034 0.46137 0.4281 0.36875 0.24656
## Proportion of Variance 0.02135 0.0195 0.01637 0.0141 0.01046 0.00468
## Cumulative Proportion 0.93490 0.9544 0.97077 0.9849 0.99532 1.00000
# plot
ggbiplot(boston.pca, ellipse = TRUE, choices=c(1,2), groups = c(boston$chas), obs.scale = 1, var.scale = 1)
We use the first two for visualization. However we would use three variables for other computations since the third PCA variability contribution is just a little less than the first one.