Part 4 Exercise 1: Principal Component Analysis, PCA
1 Importing and checking multivariate dataset
- Abdi and Williams (2010)
Multistat2txt <- read.table("Multistat2.txt", header = TRUE, sep = "\t", na.strings = "NA",
dec = ",", strip.white = TRUE)
str(Multistat2txt)'data.frame': 101 obs. of 15 variables:
$ country : Factor w/ 101 levels "Afghanistan",..: 17 43 53 64 89 90 100 2 5 11 ...
$ populatn: int 10000 5800 125500 23100 20944 59400 73100 33900 8000 10100 ...
$ density : num 55 5494 330 189 582 ...
$ urban : int 12 94 77 60 71 22 20 86 58 96 ...
$ religion: Factor w/ 7 levels "Buddhist","Catholic",..: 1 1 1 1 1 1 1 2 2 2 ...
$ lifeexpf: int 52 80 82 73 78 72 68 75 79 79 ...
$ lifeexpm: int 50 75 76 67 72 65 63 68 73 73 ...
$ literacy: int 35 77 99 99 91 93 88 95 99 99 ...
$ pop_incr: num 2.9 -0.1 0.3 1.8 0.9 1.4 1.8 1.3 0.2 0.2 ...
$ babymort: num 112 5.8 4.4 27.7 5.1 37 46 25.6 6.7 7.2 ...
$ gdp_cap : int 260 14641 19860 1000 7055 1800 230 3408 18396 17912 ...
$ birth_rt: num 45 13 11 24 15.6 19 27 20 12 12 ...
$ death_rt: int 16 6 7 6 NA 6 8 9 11 11 ...
$ aids_rt : num 0 1.71 0.57 0 NA ...
$ fertilty: num 5.8 1.4 1.6 2.4 NA 2.1 3.3 2.8 1.5 1.7 ...
1.1 Removing any missing data
1.2 Perform a Principal Component Analysis, PCA
Dataset = Multistat2txt
PCA.model <- princomp(~aids_rt + babymort + birth_rt + death_rt + density + fertilty +
gdp_cap + lifeexpf + lifeexpm + literacy + pop_incr + populatn + urban, cor = TRUE,
data = Dataset)
summary(PCA.model)Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
Standard deviation 2.7087616 1.2911281 1.04105734 1.00592346 0.87247877
Proportion of Variance 0.5644146 0.1282317 0.08336926 0.07783708 0.05855532
Cumulative Proportion 0.5644146 0.6926463 0.77601552 0.85385260 0.91240793
Comp.6 Comp.7 Comp.8 Comp.9
Standard deviation 0.72822873 0.55376084 0.336685607 0.287102064
Proportion of Variance 0.04079362 0.02358854 0.008719784 0.006340584
Cumulative Proportion 0.95320155 0.97679009 0.985509875 0.991850460
Comp.10 Comp.11 Comp.12 Comp.13
Standard deviation 0.230213455 0.178530247 0.114810302 0.0888331853
Proportion of Variance 0.004076787 0.002451773 0.001013954 0.0006070258
Cumulative Proportion 0.995927247 0.998379020 0.999392974 1.0000000000
[1] "sdev" "loadings" "center" "scale" "n.obs" "scores" "call"
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
7.337389596 1.667011828 1.083800395 1.011882011 0.761219201 0.530317084
Comp.7 Comp.8 Comp.9 Comp.10 Comp.11 Comp.12
0.306651067 0.113357198 0.082427595 0.052998235 0.031873049 0.013181405
Comp.13
0.007891335
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
aids_rt 0.13018688 0.48914305 0.27601545 0.05676051 0.56653684
babymort 0.35512821 0.04183453 -0.08010393 0.02187439 -0.09389843
birth_rt 0.34444753 -0.20250179 0.11569201 0.08111067 0.14724487
death_rt 0.21052993 0.57337135 0.05810903 -0.02092011 -0.15897720
density -0.05118745 0.04934190 -0.33818637 0.90739709 -0.05213367
fertilty 0.33899317 -0.17046116 0.11475473 0.07634149 0.20870936
gdp_cap -0.25772716 0.23925131 0.08014020 0.13546795 0.41923034
lifeexpf -0.35961662 -0.12737653 0.04114621 -0.02885970 0.03198135
lifeexpm -0.34767386 -0.18546162 0.02648132 0.01290399 0.10879400
Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
aids_rt 0.5126530216 0.008792292 0.18974818 0.185626226 0.05817629
babymort -0.1696640839 -0.093564129 0.31877203 -0.234934568 0.46764678
birth_rt -0.0053077561 0.010694398 -0.29504817 -0.003399435 0.32376006
death_rt -0.3453519055 -0.167871372 -0.39179925 0.149435735 -0.41460896
density 0.1707986494 0.003345836 -0.10231676 -0.080940253 -0.06806686
fertilty -0.1121009845 0.041349798 -0.56735134 0.143399792 0.24814347
gdp_cap -0.6021500068 0.426998407 0.07833446 -0.309830173 0.10780668
lifeexpf 0.0226901250 0.136870120 -0.13955245 0.252533799 0.01872426
lifeexpm -0.0004902739 0.230633064 -0.16059880 0.410727446 0.09467751
Comp.11 Comp.12 Comp.13
aids_rt 0.05431066 0.05381397 0.028951975
babymort 0.57346591 0.32979656 0.075824495
birth_rt 0.12369564 -0.72838633 0.246635220
death_rt 0.31270725 -0.03853288 0.058109857
density 0.05088025 0.02497325 0.020948326
fertilty -0.28876310 0.52391304 -0.131524492
gdp_cap -0.11362852 -0.06474920 0.006160408
lifeexpf 0.19057656 0.24624364 0.806907663
lifeexpm 0.56407348 -0.05164344 -0.504511980
[ reached getOption("max.print") -- omitted 4 rows ]
Dataset_PC <<- within(Dataset, {
PC4 <- PCA.model$scores[, 4]
PC3 <- PCA.model$scores[, 3]
PC2 <- PCA.model$scores[, 2]
PC1 <- PCA.model$scores[, 1]
})p <- ggplot(Dataset_PC) + aes(x = PC1, y = PC2, colour = religion, label = Dataset$country) +
geom_point(size = 2L) + scale_color_hue() + theme_light()
p + geom_text(vjust = -1, size = 2)library(ggfortify)
autoplot(PCA.model, data = Dataset, colour = "religion") + theme_light() + scale_colour_discrete("Religion")p3 <- autoplot(PCA.model, data = Dataset, colour = "religion", loadings = TRUE, loadings.colour = "blue",
loadings.label = TRUE, loadings.label.size = 3)
p3 + theme_light() + scale_colour_discrete("Religion") + geom_text(label = Dataset$country,
vjust = -1, size = 2)library(gridExtra)
p4 <- ggplot(Dataset_PC) + aes(x = babymort, y = PC1, colour = religion) + geom_point(size = 2L) +
scale_color_hue() + theme_light() + theme(legend.position = "none")
p5 <- ggplot(Dataset_PC) + aes(x = death_rt, y = PC2, colour = religion) + geom_point(size = 2L) +
scale_color_hue() + theme_light() + theme(legend.position = "none")
grid.arrange(p4, p5, ncol = 2)1.3 More examples of PCA using other functions
library(factoextra)
library(FactoMineR)
df <- subset(Dataset, select = -c(country, religion))
PCA.model2 <- PCA(df, graph = FALSE)
get_eig(PCA.model2) eigenvalue variance.percent cumulative.variance.percent
Dim.1 7.337389596 56.44145843 56.44146
Dim.2 1.667011828 12.82316791 69.26463
Dim.3 1.083800395 8.33692612 77.60155
Dim.4 1.011882011 7.78370778 85.38526
Dim.5 0.761219201 5.85553232 91.24079
Dim.6 0.530317084 4.07936219 95.32015
Dim.7 0.306651067 2.35885436 97.67901
Dim.8 0.113357198 0.87197845 98.55099
Dim.9 0.082427595 0.63405842 99.18505
Dim.10 0.052998235 0.40767873 99.59272
Dim.11 0.031873049 0.24517730 99.83790
Dim.12 0.013181405 0.10139543 99.93930
Dim.13 0.007891335 0.06070258 100.00000
# Standard PCA plot and biplot
p6 <- fviz_pca_ind(PCA.model2)
p7 <- fviz_pca_biplot(PCA.model2)
grid.arrange(p6, p7, nrow = 1)# Other way of showing contributions of variables to PCs
fviz_pca_var(PCA.model2, col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800",
"#FC4E07"), repel = TRUE)# change to addEllipses=TRUE if you want to add ellipses around the groups. Since
# religion did not group the countries that well in this PCA, we chose to leave
# the ellipses out.
fviz_pca_biplot(PCA.model, label = "var", habillage = Dataset$religion, addEllipses = FALSE,
ellipse.level = 0.95)Reference
Abdi, Hervé, and Lynne J Williams. 2010. “Principal Component Analysis.” Wiley Interdisciplinary Reviews: Computational Statistics 2 (4): 433–59. https://doi.org/10.1002/wics.101.