R Programming: Principal Component Analysis

Example

## Example 
# ---
# Perform and visualize PCA in the given mtcars dataset 
# ---
# OUR CODE GOES BELOW
# 

# Loading our dataset
# ---
# 
df <- mtcars
head(df)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# Selecting the numerical data (excluding the categorical variables vs and am)
# ---
# 
df <- mtcars[,c(1:7,10,11)]
head(df)
##                    mpg cyl disp  hp drat    wt  qsec gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22    3    1
# We then pass df to the prcomp(). We also set two arguments, center and scale, 
# to be TRUE then preview our object with summary
# ---
# 
mtcars.pca <- prcomp(mtcars[,c(1:7,10,11)], center = TRUE, scale. = TRUE)
summary(mtcars.pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.3782 1.4429 0.71008 0.51481 0.42797 0.35184 0.32413
## Proportion of Variance 0.6284 0.2313 0.05602 0.02945 0.02035 0.01375 0.01167
## Cumulative Proportion  0.6284 0.8598 0.91581 0.94525 0.96560 0.97936 0.99103
##                           PC8     PC9
## Standard deviation     0.2419 0.14896
## Proportion of Variance 0.0065 0.00247
## Cumulative Proportion  0.9975 1.00000
# As a result we obtain 9 principal components, 
# each which explain a percentate of the total variation of the dataset
# PC1 explains 63% of the total variance, which means that nearly two-thirds 
# of the information in the dataset (9 variables) can be encapsulated 
# by just that one Principal Component. PC2 explains 23% of the variance. etc
# Calling str() to have a look at your PCA object
# ---
# 
str(mtcars.pca)
## List of 5
##  $ sdev    : num [1:9] 2.378 1.443 0.71 0.515 0.428 ...
##  $ rotation: num [1:9, 1:9] -0.393 0.403 0.397 0.367 -0.312 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:9] "mpg" "cyl" "disp" "hp" ...
##   .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:9] 20.09 6.19 230.72 146.69 3.6 ...
##   ..- attr(*, "names")= chr [1:9] "mpg" "cyl" "disp" "hp" ...
##  $ scale   : Named num [1:9] 6.027 1.786 123.939 68.563 0.535 ...
##   ..- attr(*, "names")= chr [1:9] "mpg" "cyl" "disp" "hp" ...
##  $ x       : num [1:32, 1:9] -0.664 -0.637 -2.3 -0.215 1.587 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
##   .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
# Here we note that our pca object: The center point ($center), scaling ($scale), 
# standard deviation(sdev) of each principal component. 
# The relationship (correlation or anticorrelation, etc) 
# between the initial variables and the principal components ($rotation). 
# The values of each sample in terms of the principal components ($x)
# We will now plot our pca. This will provide us with some very useful insights i.e. 
# which cars are most similar to each other 
# ---
# 

# Installing our ggbiplot visualisation package
# 
#install.packages("devtools")
library(devtools)
## Loading required package: usethis
## Error in get(genname, envir = envir) : object 'testthat_print' not found
#install.packages("ggbiplot")
#install_github("vqv/ggbiplot",ref = "experimental")
Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS="true")
#library(devtools)
#install_github("vqv/ggbiplot", ref = "experimental")  
library(devtools)
install_github("vqv/ggbiplot")
## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
##   Use `force = TRUE` to force installation
# Then Loading our ggbiplot library
#  
#install.packages("ggbiplot")
library(ggbiplot)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
ggbiplot(mtcars.pca)

# From the graph we will see that the variables hp, cyl and disp contribute to PC1, 
# with higher values in those variables moving the samples to the right on the plot. 
# Adding more detail to the plot, we provide arguments rownames as labels
# 
ggbiplot(mtcars.pca, labels=rownames(mtcars), obs.scale = 1, var.scale = 1)

# We now see which cars are similar to one another. 
# The sports cars Maserati Bora, Ferrari Dino and Ford Pantera L all cluster together at the top
# We can also look at the origin of each of the cars by putting them 
# into one of three categories i.e. US, Japanese and European cars.
# 
mtcars.country <- c(rep("Japan", 3), rep("US",4), rep("Europe", 7),rep("US",3), "Europe", rep("Japan", 3), rep("US",4), rep("Europe", 3), "US", rep("Europe", 3))

ggbiplot(mtcars.pca,ellipse=TRUE,  labels=rownames(mtcars), groups=mtcars.country, obs.scale = 1, var.scale = 1)

# We get to see that US cars for a cluster on the right. 
# This cluster is characterized by high values for cyl, disp and wt. 
# Japanese cars are characterized by high mpg. 
# European cars are somewhat in the middle and less tightly clustered that either group.
# We now plot PC3 and PC4
ggbiplot(mtcars.pca,ellipse=TRUE,choices=c(3,4),   labels=rownames(mtcars), groups=mtcars.country)

# We find it difficult to derive insights from the given plot mainly because PC3 and PC4 
# explain very small percentages of the total variation, thus it would be surprising 
# if we found that they were very informative and separated the groups or revealed apparent patterns.
# Having performed PCA using this dataset, if we were to build a classification model 
# to identify the origin of a car (i.e. European, Japanese, US), 
# the variables cyl, disp, wt and mpg would be significant variables as seen in our PCA analysis.

Challenges

## Challenge 1
# ---
# Question: Perform and plot PCA to the give Iris dataset. Reduce 4 dimensinal data into 2 or three dimensions. 
# Provide remarks on your analysis.
# ---
# Dataset url = http://bit.ly/IrisDataset
# ---
# OUR CODE GOES BELOW
# 
library(data.table)
url=("http://bit.ly/IrisDataset")
irisdf<-fread(url)
#viewing the datset
head(irisdf)
##    sepal_length sepal_width petal_length petal_width     species
## 1:          5.1         3.5          1.4         0.2 Iris-setosa
## 2:          4.9         3.0          1.4         0.2 Iris-setosa
## 3:          4.7         3.2          1.3         0.2 Iris-setosa
## 4:          4.6         3.1          1.5         0.2 Iris-setosa
## 5:          5.0         3.6          1.4         0.2 Iris-setosa
## 6:          5.4         3.9          1.7         0.4 Iris-setosa
df <- irisdf[,c(1:4)]
head(df)
##    sepal_length sepal_width petal_length petal_width
## 1:          5.1         3.5          1.4         0.2
## 2:          4.9         3.0          1.4         0.2
## 3:          4.7         3.2          1.3         0.2
## 4:          4.6         3.1          1.5         0.2
## 5:          5.0         3.6          1.4         0.2
## 6:          5.4         3.9          1.7         0.4
# We then pass df to the prcomp(). We also set two arguments, center and scale, 
# to be TRUE then preview our object with summary
# ---
# 
df.pca <- prcomp(df[,c(1:4)], center = TRUE, scale. = TRUE)
summary(df.pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.7061 0.9598 0.38387 0.14355
## Proportion of Variance 0.7277 0.2303 0.03684 0.00515
## Cumulative Proportion  0.7277 0.9580 0.99485 1.00000
# Calling str() to have a look at your PCA object
# ---
# 
str(df.pca)
## List of 5
##  $ sdev    : num [1:4] 1.706 0.96 0.384 0.144
##  $ rotation: num [1:4, 1:4] 0.522 -0.263 0.581 0.566 -0.372 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  $ center  : Named num [1:4] 5.84 3.05 3.76 1.2
##   ..- attr(*, "names")= chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
##  $ scale   : Named num [1:4] 0.828 0.434 1.764 0.763
##   ..- attr(*, "names")= chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
##  $ x       : num [1:150, 1:4] -2.26 -2.08 -2.36 -2.3 -2.38 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  - attr(*, "class")= chr "prcomp"
#library(devtools)
library(ggbiplot)
ggbiplot(df.pca)

# Adding more detail to the plot, we provide arguments rownames as labels
# 
ggbiplot(df.pca, labels=rownames(df), obs.scale = 1, var.scale = 1)

## Challenge 2
# ---
# Question: Perform and plot PCA on the given dataset.
# ---
# Dataset url = http://bit.ly/WisconsinDataset
# ---
# OUR CODE GOES BELOW
# 
url =("C:\\Users\\RoySambu\\Downloads\\Data.csv")
wisconsdf<-fread(url)
## Warning in fread(url): Detected 33 column names but the data has 32 columns.
## Filling rows automatically. Set fill=TRUE explicitly to avoid this warning.
head(wisconsdf)
##          id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1:   842302         M       17.99        10.38         122.80    1001.0
## 2:   842517         M       20.57        17.77         132.90    1326.0
## 3: 84300903         M       19.69        21.25         130.00    1203.0
## 4: 84348301         M       11.42        20.38          77.58     386.1
## 5: 84358402         M       20.29        14.34         135.10    1297.0
## 6:   843786         M       12.45        15.70          82.57     477.1
##    smoothness_mean compactness_mean concavity_mean concave points_mean
## 1:         0.11840          0.27760         0.3001             0.14710
## 2:         0.08474          0.07864         0.0869             0.07017
## 3:         0.10960          0.15990         0.1974             0.12790
## 4:         0.14250          0.28390         0.2414             0.10520
## 5:         0.10030          0.13280         0.1980             0.10430
## 6:         0.12780          0.17000         0.1578             0.08089
##    symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1:        0.2419                0.07871    1.0950     0.9053        8.589
## 2:        0.1812                0.05667    0.5435     0.7339        3.398
## 3:        0.2069                0.05999    0.7456     0.7869        4.585
## 4:        0.2597                0.09744    0.4956     1.1560        3.445
## 5:        0.1809                0.05883    0.7572     0.7813        5.438
## 6:        0.2087                0.07613    0.3345     0.8902        2.217
##    area_se smoothness_se compactness_se concavity_se concave points_se
## 1:  153.40      0.006399        0.04904      0.05373           0.01587
## 2:   74.08      0.005225        0.01308      0.01860           0.01340
## 3:   94.03      0.006150        0.04006      0.03832           0.02058
## 4:   27.23      0.009110        0.07458      0.05661           0.01867
## 5:   94.44      0.011490        0.02461      0.05688           0.01885
## 6:   27.19      0.007510        0.03345      0.03672           0.01137
##    symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1:     0.03003             0.006193        25.38         17.33          184.60
## 2:     0.01389             0.003532        24.99         23.41          158.80
## 3:     0.02250             0.004571        23.57         25.53          152.50
## 4:     0.05963             0.009208        14.91         26.50           98.87
## 5:     0.01756             0.005115        22.54         16.67          152.20
## 6:     0.02165             0.005082        15.47         23.75          103.40
##    area_worst smoothness_worst compactness_worst concavity_worst
## 1:     2019.0           0.1622            0.6656          0.7119
## 2:     1956.0           0.1238            0.1866          0.2416
## 3:     1709.0           0.1444            0.4245          0.4504
## 4:      567.7           0.2098            0.8663          0.6869
## 5:     1575.0           0.1374            0.2050          0.4000
## 6:      741.6           0.1791            0.5249          0.5355
##    concave points_worst symmetry_worst fractal_dimension_worst V33
## 1:               0.2654         0.4601                 0.11890  NA
## 2:               0.1860         0.2750                 0.08902  NA
## 3:               0.2430         0.3613                 0.08758  NA
## 4:               0.2575         0.6638                 0.17300  NA
## 5:               0.1625         0.2364                 0.07678  NA
## 6:               0.1741         0.3985                 0.12440  NA
tail(wisconsdf)
##        id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1: 926125         M       20.92        25.09         143.00    1347.0
## 2: 926424         M       21.56        22.39         142.00    1479.0
## 3: 926682         M       20.13        28.25         131.20    1261.0
## 4: 926954         M       16.60        28.08         108.30     858.1
## 5: 927241         M       20.60        29.33         140.10    1265.0
## 6:  92751         B        7.76        24.54          47.92     181.0
##    smoothness_mean compactness_mean concavity_mean concave points_mean
## 1:         0.10990          0.22360        0.31740             0.14740
## 2:         0.11100          0.11590        0.24390             0.13890
## 3:         0.09780          0.10340        0.14400             0.09791
## 4:         0.08455          0.10230        0.09251             0.05302
## 5:         0.11780          0.27700        0.35140             0.15200
## 6:         0.05263          0.04362        0.00000             0.00000
##    symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1:        0.2149                0.06879    0.9622      1.026        8.758
## 2:        0.1726                0.05623    1.1760      1.256        7.673
## 3:        0.1752                0.05533    0.7655      2.463        5.203
## 4:        0.1590                0.05648    0.4564      1.075        3.425
## 5:        0.2397                0.07016    0.7260      1.595        5.772
## 6:        0.1587                0.05884    0.3857      1.428        2.548
##    area_se smoothness_se compactness_se concavity_se concave points_se
## 1:  118.80      0.006399        0.04310      0.07845           0.02624
## 2:  158.70      0.010300        0.02891      0.05198           0.02454
## 3:   99.04      0.005769        0.02423      0.03950           0.01678
## 4:   48.55      0.005903        0.03731      0.04730           0.01557
## 5:   86.22      0.006522        0.06158      0.07117           0.01664
## 6:   19.15      0.007189        0.00466      0.00000           0.00000
##    symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1:     0.02057             0.006213       24.290         29.41          179.10
## 2:     0.01114             0.004239       25.450         26.40          166.10
## 3:     0.01898             0.002498       23.690         38.25          155.00
## 4:     0.01318             0.003892       18.980         34.12          126.70
## 5:     0.02324             0.006185       25.740         39.42          184.60
## 6:     0.02676             0.002783        9.456         30.37           59.16
##    area_worst smoothness_worst compactness_worst concavity_worst
## 1:     1819.0          0.14070           0.41860          0.6599
## 2:     2027.0          0.14100           0.21130          0.4107
## 3:     1731.0          0.11660           0.19220          0.3215
## 4:     1124.0          0.11390           0.30940          0.3403
## 5:     1821.0          0.16500           0.86810          0.9387
## 6:      268.6          0.08996           0.06444          0.0000
##    concave points_worst symmetry_worst fractal_dimension_worst V33
## 1:               0.2542         0.2929                 0.09873  NA
## 2:               0.2216         0.2060                 0.07115  NA
## 3:               0.1628         0.2572                 0.06637  NA
## 4:               0.1418         0.2218                 0.07820  NA
## 5:               0.2650         0.4087                 0.12400  NA
## 6:               0.0000         0.2871                 0.07039  NA
df1.pca <- prcomp(wisconsdf[,c(3:32)], center = TRUE, scale. = TRUE)
summary(df1.pca)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
## Cumulative Proportion  0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
##                            PC8    PC9    PC10   PC11    PC12    PC13    PC14
## Standard deviation     0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
## Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
## Cumulative Proportion  0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
##                           PC15    PC16    PC17    PC18    PC19    PC20   PC21
## Standard deviation     0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
## Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
## Cumulative Proportion  0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
##                           PC22    PC23   PC24    PC25    PC26    PC27    PC28
## Standard deviation     0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion  0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
##                           PC29    PC30
## Standard deviation     0.02736 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion  1.00000 1.00000
str(df1.pca)
## List of 5
##  $ sdev    : num [1:30] 3.64 2.39 1.68 1.41 1.28 ...
##  $ rotation: num [1:30, 1:30] -0.219 -0.104 -0.228 -0.221 -0.143 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:30] "radius_mean" "texture_mean" "perimeter_mean" "area_mean" ...
##   .. ..$ : chr [1:30] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:30] 14.1273 19.2896 91.969 654.8891 0.0964 ...
##   ..- attr(*, "names")= chr [1:30] "radius_mean" "texture_mean" "perimeter_mean" "area_mean" ...
##  $ scale   : Named num [1:30] 3.524 4.301 24.299 351.9141 0.0141 ...
##   ..- attr(*, "names")= chr [1:30] "radius_mean" "texture_mean" "perimeter_mean" "area_mean" ...
##  $ x       : num [1:569, 1:30] -9.18 -2.39 -5.73 -7.12 -3.93 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:30] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
ggbiplot(df1.pca)

ggbiplot(df1.pca, labels=rownames(wisconsdf), obs.scale = 1, var.scale = 1)

## Challenge {3}
# ---
# Question: Perform and plot the given housing dataset. Provide remarks to your analysis. 
# ---
# Dataset url = http://bit.ly/BostonHousingDataset
# ---
# OUR CODE GOES BELOW
#