Example
## Example
# ---
# Perform and visualize PCA in the given mtcars dataset
# ---
# OUR CODE GOES BELOW
#
# Loading our dataset
# ---
#
df <- mtcars
head(df)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Selecting the numerical data (excluding the categorical variables vs and am)
# ---
#
df <- mtcars[,c(1:7,10,11)]
head(df)
## mpg cyl disp hp drat wt qsec gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 3 1
# We then pass df to the prcomp(). We also set two arguments, center and scale,
# to be TRUE then preview our object with summary
# ---
#
mtcars.pca <- prcomp(mtcars[,c(1:7,10,11)], center = TRUE, scale. = TRUE)
summary(mtcars.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3782 1.4429 0.71008 0.51481 0.42797 0.35184 0.32413
## Proportion of Variance 0.6284 0.2313 0.05602 0.02945 0.02035 0.01375 0.01167
## Cumulative Proportion 0.6284 0.8598 0.91581 0.94525 0.96560 0.97936 0.99103
## PC8 PC9
## Standard deviation 0.2419 0.14896
## Proportion of Variance 0.0065 0.00247
## Cumulative Proportion 0.9975 1.00000
# As a result we obtain 9 principal components,
# each which explain a percentate of the total variation of the dataset
# PC1 explains 63% of the total variance, which means that nearly two-thirds
# of the information in the dataset (9 variables) can be encapsulated
# by just that one Principal Component. PC2 explains 23% of the variance. etc
# Calling str() to have a look at your PCA object
# ---
#
str(mtcars.pca)
## List of 5
## $ sdev : num [1:9] 2.378 1.443 0.71 0.515 0.428 ...
## $ rotation: num [1:9, 1:9] -0.393 0.403 0.397 0.367 -0.312 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:9] "mpg" "cyl" "disp" "hp" ...
## .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:9] 20.09 6.19 230.72 146.69 3.6 ...
## ..- attr(*, "names")= chr [1:9] "mpg" "cyl" "disp" "hp" ...
## $ scale : Named num [1:9] 6.027 1.786 123.939 68.563 0.535 ...
## ..- attr(*, "names")= chr [1:9] "mpg" "cyl" "disp" "hp" ...
## $ x : num [1:32, 1:9] -0.664 -0.637 -2.3 -0.215 1.587 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## .. ..$ : chr [1:9] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
# Here we note that our pca object: The center point ($center), scaling ($scale),
# standard deviation(sdev) of each principal component.
# The relationship (correlation or anticorrelation, etc)
# between the initial variables and the principal components ($rotation).
# The values of each sample in terms of the principal components ($x)
# We will now plot our pca. This will provide us with some very useful insights i.e.
# which cars are most similar to each other
# ---
#
# Installing our ggbiplot visualisation package
#
#install.packages("devtools")
library(devtools)
## Loading required package: usethis
## Error in get(genname, envir = envir) : object 'testthat_print' not found
#install.packages("ggbiplot")
#install_github("vqv/ggbiplot",ref = "experimental")
Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS="true")
#library(devtools)
#install_github("vqv/ggbiplot", ref = "experimental")
library(devtools)
install_github("vqv/ggbiplot")
## Skipping install of 'ggbiplot' from a github remote, the SHA1 (7325e880) has not changed since last install.
## Use `force = TRUE` to force installation
# Then Loading our ggbiplot library
#
#install.packages("ggbiplot")
library(ggbiplot)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
ggbiplot(mtcars.pca)

# From the graph we will see that the variables hp, cyl and disp contribute to PC1,
# with higher values in those variables moving the samples to the right on the plot.
# Adding more detail to the plot, we provide arguments rownames as labels
#
ggbiplot(mtcars.pca, labels=rownames(mtcars), obs.scale = 1, var.scale = 1)

# We now see which cars are similar to one another.
# The sports cars Maserati Bora, Ferrari Dino and Ford Pantera L all cluster together at the top
# We can also look at the origin of each of the cars by putting them
# into one of three categories i.e. US, Japanese and European cars.
#
mtcars.country <- c(rep("Japan", 3), rep("US",4), rep("Europe", 7),rep("US",3), "Europe", rep("Japan", 3), rep("US",4), rep("Europe", 3), "US", rep("Europe", 3))
ggbiplot(mtcars.pca,ellipse=TRUE, labels=rownames(mtcars), groups=mtcars.country, obs.scale = 1, var.scale = 1)

# We get to see that US cars for a cluster on the right.
# This cluster is characterized by high values for cyl, disp and wt.
# Japanese cars are characterized by high mpg.
# European cars are somewhat in the middle and less tightly clustered that either group.
# We now plot PC3 and PC4
ggbiplot(mtcars.pca,ellipse=TRUE,choices=c(3,4), labels=rownames(mtcars), groups=mtcars.country)

# We find it difficult to derive insights from the given plot mainly because PC3 and PC4
# explain very small percentages of the total variation, thus it would be surprising
# if we found that they were very informative and separated the groups or revealed apparent patterns.
# Having performed PCA using this dataset, if we were to build a classification model
# to identify the origin of a car (i.e. European, Japanese, US),
# the variables cyl, disp, wt and mpg would be significant variables as seen in our PCA analysis.
Challenges
## Challenge 1
# ---
# Question: Perform and plot PCA to the give Iris dataset. Reduce 4 dimensinal data into 2 or three dimensions.
# Provide remarks on your analysis.
# ---
# Dataset url = http://bit.ly/IrisDataset
# ---
# OUR CODE GOES BELOW
#
library(data.table)
url=("http://bit.ly/IrisDataset")
irisdf<-fread(url)
#viewing the datset
head(irisdf)
## sepal_length sepal_width petal_length petal_width species
## 1: 5.1 3.5 1.4 0.2 Iris-setosa
## 2: 4.9 3.0 1.4 0.2 Iris-setosa
## 3: 4.7 3.2 1.3 0.2 Iris-setosa
## 4: 4.6 3.1 1.5 0.2 Iris-setosa
## 5: 5.0 3.6 1.4 0.2 Iris-setosa
## 6: 5.4 3.9 1.7 0.4 Iris-setosa
df <- irisdf[,c(1:4)]
head(df)
## sepal_length sepal_width petal_length petal_width
## 1: 5.1 3.5 1.4 0.2
## 2: 4.9 3.0 1.4 0.2
## 3: 4.7 3.2 1.3 0.2
## 4: 4.6 3.1 1.5 0.2
## 5: 5.0 3.6 1.4 0.2
## 6: 5.4 3.9 1.7 0.4
# We then pass df to the prcomp(). We also set two arguments, center and scale,
# to be TRUE then preview our object with summary
# ---
#
df.pca <- prcomp(df[,c(1:4)], center = TRUE, scale. = TRUE)
summary(df.pca)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.7061 0.9598 0.38387 0.14355
## Proportion of Variance 0.7277 0.2303 0.03684 0.00515
## Cumulative Proportion 0.7277 0.9580 0.99485 1.00000
# Calling str() to have a look at your PCA object
# ---
#
str(df.pca)
## List of 5
## $ sdev : num [1:4] 1.706 0.96 0.384 0.144
## $ rotation: num [1:4, 1:4] 0.522 -0.263 0.581 0.566 -0.372 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
## .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
## $ center : Named num [1:4] 5.84 3.05 3.76 1.2
## ..- attr(*, "names")= chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
## $ scale : Named num [1:4] 0.828 0.434 1.764 0.763
## ..- attr(*, "names")= chr [1:4] "sepal_length" "sepal_width" "petal_length" "petal_width"
## $ x : num [1:150, 1:4] -2.26 -2.08 -2.36 -2.3 -2.38 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
## - attr(*, "class")= chr "prcomp"
#library(devtools)
library(ggbiplot)
ggbiplot(df.pca)

# Adding more detail to the plot, we provide arguments rownames as labels
#
ggbiplot(df.pca, labels=rownames(df), obs.scale = 1, var.scale = 1)

## Challenge 2
# ---
# Question: Perform and plot PCA on the given dataset.
# ---
# Dataset url = http://bit.ly/WisconsinDataset
# ---
# OUR CODE GOES BELOW
#
url =("C:\\Users\\RoySambu\\Downloads\\Data.csv")
wisconsdf<-fread(url)
## Warning in fread(url): Detected 33 column names but the data has 32 columns.
## Filling rows automatically. Set fill=TRUE explicitly to avoid this warning.
head(wisconsdf)
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1: 842302 M 17.99 10.38 122.80 1001.0
## 2: 842517 M 20.57 17.77 132.90 1326.0
## 3: 84300903 M 19.69 21.25 130.00 1203.0
## 4: 84348301 M 11.42 20.38 77.58 386.1
## 5: 84358402 M 20.29 14.34 135.10 1297.0
## 6: 843786 M 12.45 15.70 82.57 477.1
## smoothness_mean compactness_mean concavity_mean concave points_mean
## 1: 0.11840 0.27760 0.3001 0.14710
## 2: 0.08474 0.07864 0.0869 0.07017
## 3: 0.10960 0.15990 0.1974 0.12790
## 4: 0.14250 0.28390 0.2414 0.10520
## 5: 0.10030 0.13280 0.1980 0.10430
## 6: 0.12780 0.17000 0.1578 0.08089
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1: 0.2419 0.07871 1.0950 0.9053 8.589
## 2: 0.1812 0.05667 0.5435 0.7339 3.398
## 3: 0.2069 0.05999 0.7456 0.7869 4.585
## 4: 0.2597 0.09744 0.4956 1.1560 3.445
## 5: 0.1809 0.05883 0.7572 0.7813 5.438
## 6: 0.2087 0.07613 0.3345 0.8902 2.217
## area_se smoothness_se compactness_se concavity_se concave points_se
## 1: 153.40 0.006399 0.04904 0.05373 0.01587
## 2: 74.08 0.005225 0.01308 0.01860 0.01340
## 3: 94.03 0.006150 0.04006 0.03832 0.02058
## 4: 27.23 0.009110 0.07458 0.05661 0.01867
## 5: 94.44 0.011490 0.02461 0.05688 0.01885
## 6: 27.19 0.007510 0.03345 0.03672 0.01137
## symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1: 0.03003 0.006193 25.38 17.33 184.60
## 2: 0.01389 0.003532 24.99 23.41 158.80
## 3: 0.02250 0.004571 23.57 25.53 152.50
## 4: 0.05963 0.009208 14.91 26.50 98.87
## 5: 0.01756 0.005115 22.54 16.67 152.20
## 6: 0.02165 0.005082 15.47 23.75 103.40
## area_worst smoothness_worst compactness_worst concavity_worst
## 1: 2019.0 0.1622 0.6656 0.7119
## 2: 1956.0 0.1238 0.1866 0.2416
## 3: 1709.0 0.1444 0.4245 0.4504
## 4: 567.7 0.2098 0.8663 0.6869
## 5: 1575.0 0.1374 0.2050 0.4000
## 6: 741.6 0.1791 0.5249 0.5355
## concave points_worst symmetry_worst fractal_dimension_worst V33
## 1: 0.2654 0.4601 0.11890 NA
## 2: 0.1860 0.2750 0.08902 NA
## 3: 0.2430 0.3613 0.08758 NA
## 4: 0.2575 0.6638 0.17300 NA
## 5: 0.1625 0.2364 0.07678 NA
## 6: 0.1741 0.3985 0.12440 NA
tail(wisconsdf)
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## 1: 926125 M 20.92 25.09 143.00 1347.0
## 2: 926424 M 21.56 22.39 142.00 1479.0
## 3: 926682 M 20.13 28.25 131.20 1261.0
## 4: 926954 M 16.60 28.08 108.30 858.1
## 5: 927241 M 20.60 29.33 140.10 1265.0
## 6: 92751 B 7.76 24.54 47.92 181.0
## smoothness_mean compactness_mean concavity_mean concave points_mean
## 1: 0.10990 0.22360 0.31740 0.14740
## 2: 0.11100 0.11590 0.24390 0.13890
## 3: 0.09780 0.10340 0.14400 0.09791
## 4: 0.08455 0.10230 0.09251 0.05302
## 5: 0.11780 0.27700 0.35140 0.15200
## 6: 0.05263 0.04362 0.00000 0.00000
## symmetry_mean fractal_dimension_mean radius_se texture_se perimeter_se
## 1: 0.2149 0.06879 0.9622 1.026 8.758
## 2: 0.1726 0.05623 1.1760 1.256 7.673
## 3: 0.1752 0.05533 0.7655 2.463 5.203
## 4: 0.1590 0.05648 0.4564 1.075 3.425
## 5: 0.2397 0.07016 0.7260 1.595 5.772
## 6: 0.1587 0.05884 0.3857 1.428 2.548
## area_se smoothness_se compactness_se concavity_se concave points_se
## 1: 118.80 0.006399 0.04310 0.07845 0.02624
## 2: 158.70 0.010300 0.02891 0.05198 0.02454
## 3: 99.04 0.005769 0.02423 0.03950 0.01678
## 4: 48.55 0.005903 0.03731 0.04730 0.01557
## 5: 86.22 0.006522 0.06158 0.07117 0.01664
## 6: 19.15 0.007189 0.00466 0.00000 0.00000
## symmetry_se fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1: 0.02057 0.006213 24.290 29.41 179.10
## 2: 0.01114 0.004239 25.450 26.40 166.10
## 3: 0.01898 0.002498 23.690 38.25 155.00
## 4: 0.01318 0.003892 18.980 34.12 126.70
## 5: 0.02324 0.006185 25.740 39.42 184.60
## 6: 0.02676 0.002783 9.456 30.37 59.16
## area_worst smoothness_worst compactness_worst concavity_worst
## 1: 1819.0 0.14070 0.41860 0.6599
## 2: 2027.0 0.14100 0.21130 0.4107
## 3: 1731.0 0.11660 0.19220 0.3215
## 4: 1124.0 0.11390 0.30940 0.3403
## 5: 1821.0 0.16500 0.86810 0.9387
## 6: 268.6 0.08996 0.06444 0.0000
## concave points_worst symmetry_worst fractal_dimension_worst V33
## 1: 0.2542 0.2929 0.09873 NA
## 2: 0.2216 0.2060 0.07115 NA
## 3: 0.1628 0.2572 0.06637 NA
## 4: 0.1418 0.2218 0.07820 NA
## 5: 0.2650 0.4087 0.12400 NA
## 6: 0.0000 0.2871 0.07039 NA
df1.pca <- prcomp(wisconsdf[,c(3:32)], center = TRUE, scale. = TRUE)
summary(df1.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880 0.82172
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025 0.02251
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759 0.91010
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.69037 0.6457 0.59219 0.5421 0.51104 0.49128 0.39624
## Proportion of Variance 0.01589 0.0139 0.01169 0.0098 0.00871 0.00805 0.00523
## Cumulative Proportion 0.92598 0.9399 0.95157 0.9614 0.97007 0.97812 0.98335
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.30681 0.28260 0.24372 0.22939 0.22244 0.17652 0.1731
## Proportion of Variance 0.00314 0.00266 0.00198 0.00175 0.00165 0.00104 0.0010
## Cumulative Proportion 0.98649 0.98915 0.99113 0.99288 0.99453 0.99557 0.9966
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.16565 0.15602 0.1344 0.12442 0.09043 0.08307 0.03987
## Proportion of Variance 0.00091 0.00081 0.0006 0.00052 0.00027 0.00023 0.00005
## Cumulative Proportion 0.99749 0.99830 0.9989 0.99942 0.99969 0.99992 0.99997
## PC29 PC30
## Standard deviation 0.02736 0.01153
## Proportion of Variance 0.00002 0.00000
## Cumulative Proportion 1.00000 1.00000
str(df1.pca)
## List of 5
## $ sdev : num [1:30] 3.64 2.39 1.68 1.41 1.28 ...
## $ rotation: num [1:30, 1:30] -0.219 -0.104 -0.228 -0.221 -0.143 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:30] "radius_mean" "texture_mean" "perimeter_mean" "area_mean" ...
## .. ..$ : chr [1:30] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:30] 14.1273 19.2896 91.969 654.8891 0.0964 ...
## ..- attr(*, "names")= chr [1:30] "radius_mean" "texture_mean" "perimeter_mean" "area_mean" ...
## $ scale : Named num [1:30] 3.524 4.301 24.299 351.9141 0.0141 ...
## ..- attr(*, "names")= chr [1:30] "radius_mean" "texture_mean" "perimeter_mean" "area_mean" ...
## $ x : num [1:569, 1:30] -9.18 -2.39 -5.73 -7.12 -3.93 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:30] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
ggbiplot(df1.pca)

ggbiplot(df1.pca, labels=rownames(wisconsdf), obs.scale = 1, var.scale = 1)

## Challenge {3}
# ---
# Question: Perform and plot the given housing dataset. Provide remarks to your analysis.
# ---
# Dataset url = http://bit.ly/BostonHousingDataset
# ---
# OUR CODE GOES BELOW
#