## Example 
# ---
# Perform and visualize PCA on the given mtcars dataset 
# ---
# OUR CODE GOES BELOW
# 

# Loading our dataset
# ---
# 
df <- mtcars
head(df)
str(df)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
# Selecting the numerical data (excluding the categorical variables vs and am)
# ---
# 
df <- mtcars[,c(1:7,10,11)]
head(df)
# We then pass df to prcomp(), setting the center and scale. arguments 
# to TRUE, and then preview the resulting object with summary()
# ---
# 
mtcars.pca <- prcomp(df, center = TRUE, scale. = TRUE)
summary(mtcars.pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.3782 1.4429 0.71008 0.51481 0.42797 0.35184 0.32413
## Proportion of Variance 0.6284 0.2313 0.05602 0.02945 0.02035 0.01375 0.01167
## Cumulative Proportion  0.6284 0.8598 0.91581 0.94525 0.96560 0.97936 0.99103
##                           PC8     PC9
## Standard deviation     0.2419 0.14896
## Proportion of Variance 0.0065 0.00247
## Cumulative Proportion  0.9975 1.00000
# As a result we obtain 9 principal components, 
# each of which explains a percentage of the total variation in the dataset.
# PC1 explains 63% of the total variance, which means that nearly two-thirds 
# of the information in the dataset (9 variables) can be encapsulated 
# by just that one principal component. PC2 explains 23% of the variance, and so on.
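# The proportions reported by summary() can be recomputed directly from the
# standard deviations stored in the PCA object, as a quick sanity check:
var.explained <- mtcars.pca$sdev^2 / sum(mtcars.pca$sdev^2)
round(var.explained, 4)         # matches the "Proportion of Variance" row
round(cumsum(var.explained), 4) # matches the "Cumulative Proportion" row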
# Calling str() to have a look at your PCA object
# ---
# 
str(mtcars.pca)
## List of 5
##  $ sdev    : num [1:9] 2.378 1.443 0.71 0.515 0.428 ...
##  $ rotation: num [1:9, 1:9] -0.393 0.403 0.397 0.367 -0.312 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:9] "mpg" "cyl" "disp" "hp" ...
##   .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:9] 20.09 6.19 230.72 146.69 3.6 ...
##   ..- attr(*, "names")= chr [1:9] "mpg" "cyl" "disp" "hp" ...
##  $ scale   : Named num [1:9] 6.027 1.786 123.939 68.563 0.535 ...
##   ..- attr(*, "names")= chr [1:9] "mpg" "cyl" "disp" "hp" ...
##  $ x       : num [1:32, 1:9] -0.664 -0.637 -2.3 -0.215 1.587 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
##   .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"

Here we note that our PCA object contains: the center point ($center) and the scaling ($scale) applied to each variable, the standard deviation (sdev) of each principal component, the relationship (correlation or anticorrelation, etc.) between the initial variables and the principal components ($rotation), and the values of each sample in terms of the principal components ($x).
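Each of these can be inspected directly on the prcomp object:

mtcars.pca$center           # per-variable means used for centering
mtcars.pca$scale            # per-variable standard deviations used for scaling
mtcars.pca$rotation[, 1:2]  # loadings of each variable on PC1 and PC2
head(mtcars.pca$x)          # principal component scores for the first six cars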

# For a better understanding of the linear transformation, we use a biplot;
# the x axis of the biplot represents PC1
biplot(mtcars.pca)
# The bend (elbow) in the scree plot marks the point where each additional
# component contributes little extra variance
plot(mtcars.pca, type = "l")
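# A common rule of thumb is to keep enough components to cover a chosen share
# of the total variance (the 90% threshold below is an arbitrary choice):
var.explained <- mtcars.pca$sdev^2 / sum(mtcars.pca$sdev^2)
which(cumsum(var.explained) >= 0.90)[1]  # here the first 3 components already exceed 90%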
# We will now plot our PCA. This will provide us with some useful insights, e.g. 
# which cars are most similar to each other 
# ---
# 

# Installing our ggbiplot visualisation package
# 
library(devtools)
## Warning: package 'devtools' was built under R version 4.1.3
## Loading required package: usethis
#install_github("vqv/ggbiplot", force = TRUE)
# ggbiplot is left commented out here; instead we use ggplot_pca() from the
# AMR package below, which produces a similar biplot
#  
#library(ggbiplot)
#ggbiplot(mtcars.pca)
#install.packages("ggplot2", dependencies=TRUE)

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
library(AMR)
## Warning: package 'AMR' was built under R version 4.1.3
ggplot_pca(mtcars.pca)

# From the graph we see that the variables hp, cyl and disp contribute to PC1, 
# with higher values in those variables moving the samples to the right on the plot. 
# To add more detail to the plot, we pass the row names as labels
# 
ggplot_pca(mtcars.pca, labels=rownames(mtcars), obs.scale = 1, var.scale = 1)

# We now see which cars are similar to one another. 
# The sports cars Maserati Bora, Ferrari Dino and Ford Pantera L all cluster together at the top
# We can also look at the origin of each of the cars by putting them 
# into one of three categories i.e. US, Japanese and European cars.
# 
mtcars.country <- c(rep("Japan", 3), rep("US", 4), rep("Europe", 7), rep("US", 3), "Europe",
                    rep("Japan", 3), rep("US", 4), rep("Europe", 3), "US", rep("Europe", 3))

ggplot_pca(mtcars.pca, ellipse = TRUE, labels = rownames(mtcars), groups = mtcars.country,
           obs.scale = 1, var.scale = 1)

# We see that the US cars form a cluster on the right. 
# This cluster is characterized by high values for cyl, disp and wt. 
# Japanese cars are characterized by high mpg. 
# European cars sit somewhat in the middle and are less tightly clustered than either group.
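# We can back this reading up numerically with the mean PC scores per region
# (note that the signs of principal components are arbitrary, so "right" on
# the plot may correspond to either sign of PC1):
aggregate(as.data.frame(mtcars.pca$x[, 1:2]),
          by = list(country = mtcars.country), FUN = mean)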
# We now plot PC3 and PC4
ggplot_pca(mtcars.pca, ellipse = TRUE, choices = c(3, 4), labels = rownames(mtcars))

# We find it difficult to derive insights from this plot, mainly because PC3 and PC4 
# explain very small percentages of the total variation; it would be surprising 
# if they were very informative, separated the groups, or revealed clear patterns.
# Having performed PCA on this dataset, if we were to build a classification model 
# to identify the origin of a car (i.e. European, Japanese, US), 
# the variables cyl, disp, wt and mpg would be significant variables, as seen in our PCA.
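# The claim about influential variables can be checked by ranking the
# absolute loadings of each variable on PC1:
sort(abs(mtcars.pca$rotation[, "PC1"]), decreasing = TRUE)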
## Challenge 1
# ---
# Question: Perform and plot PCA on the given Iris dataset. Reduce the 4-dimensional data into two or three dimensions. 
# Provide remarks on your analysis.
# ---
# Dataset url = http://bit.ly/IrisDataset
# ---
# OUR CODE GOES BELOW
# 
iris <- read.csv('http://bit.ly/IrisDataset')
head(iris)
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ sepal_length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ sepal_width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ petal_length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ petal_width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ species     : chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
df1 <- iris[,c(1:4)]
head(df1)
iris.pca <- prcomp(df1, center = TRUE, scale. = TRUE)
summary(iris.pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.7061 0.9598 0.38387 0.14355
## Proportion of Variance 0.7277 0.2303 0.03684 0.00515
## Cumulative Proportion  0.7277 0.9580 0.99485 1.00000
str(iris.pca)
## List of 5
##  $ sdev    : num [1:4] 1.706 0.96 0.384 0.144
##  $ rotation: num [1:4, 1:4] 0.522 -0.263 0.581 0.566 -0.372 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  $ center  : Named num [1:4] 5.84 3.05 3.76 1.2
##   ..- attr(*, "names")= chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
##  $ scale   : Named num [1:4] 0.828 0.434 1.764 0.763
##   ..- attr(*, "names")= chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
##  $ x       : num [1:150, 1:4] -2.26 -2.08 -2.36 -2.3 -2.38 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  - attr(*, "class")= chr "prcomp"
# An alternative using ggfortify's autoplot() (left commented out):
# iris.pca.plot <- autoplot(iris.pca,
#                           data = iris,
#                           colour = 'species')
# iris.pca.plot
# Scree plot of the iris PCA
plot(iris.pca, type = "l")
ggplot_pca(iris.pca)

# Adding more detail to the plot, we provide arguments rownames as labels
# 
ggplot_pca(iris.pca, labels=rownames(iris), obs.scale = 1, var.scale = 1)

# We can now see which observations are similar to one another: the setosa samples 
# form a distinct cluster, well separated from versicolor and virginica along PC1.
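# To reduce the data to two dimensions we simply keep the first two score
# columns, which retain about 96% of the variance; a base R sketch of the
# projection, coloured by species:
plot(iris.pca$x[, 1], iris.pca$x[, 2],
     col = as.integer(factor(iris$species)),
     xlab = "PC1 (72.8%)", ylab = "PC2 (23.0%)",
     main = "Iris data projected onto the first two PCs")
legend("topright", legend = levels(factor(iris$species)),
       col = 1:3, pch = 1)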
## Challenge 2
# ---
# Question: Perform and plot PCA on the given dataset.
# ---
# Dataset url = http://bit.ly/WisconsinDataset
# ---
# OUR CODE GOES BELOW
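# A minimal sketch, assuming the URL serves a CSV and that only the numeric
# columns (excluding any id or label columns) should enter the PCA; the
# column structure of the file is not verified here, and missing values
# would need na.omit() first.
wisc <- read.csv('http://bit.ly/WisconsinDataset')
str(wisc)
wisc.num <- Filter(is.numeric, wisc)                 # keep numeric columns only
wisc.num <- wisc.num[, apply(wisc.num, 2, var) > 0]  # drop zero-variance columns, which break scaling
wisc.pca <- prcomp(wisc.num, center = TRUE, scale. = TRUE)
summary(wisc.pca)
plot(wisc.pca, type = "l")
ggplot_pca(wisc.pca)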






## Challenge 3
# ---
# Question: Perform and plot PCA on the given housing dataset. Provide remarks on your analysis. 
# ---
# Dataset url = http://bit.ly/BostonHousingDataset
# ---
# OUR CODE GOES BELOW
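# A minimal sketch along the same lines, again assuming the URL serves a CSV
# of (mostly) numeric columns; column names are not verified here.
boston <- read.csv('http://bit.ly/BostonHousingDataset')
str(boston)
boston.num <- Filter(is.numeric, boston)  # keep numeric columns only
boston.pca <- prcomp(boston.num, center = TRUE, scale. = TRUE)
summary(boston.pca)
plot(boston.pca, type = "l")
ggplot_pca(boston.pca)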
