1 - Empirical example
2 - PCA on correlated vs uncorrelated variables
3 - PCA on a correlation matrix (scaled) vs covariance matrix (non-scaled)
library(tidyverse)
library(DT)
library(corrplot)
library(psych)
library(plotly)
library(htmlwidgets)
library(ggbiplot)
library(ggfortify)
require(FactoMineR)
require(factoextra)
Times Higher Education 2021 ranking data
data_raw <- read.csv("THE2021.csv")
str(data_raw)
## 'data.frame': 1448 obs. of 11 variables:
## $ name : chr "University of Oxford" "Stanford University" "Harvard University" "California Institute of Technology" ...
## $ location : chr "United Kingdom" "United States" "United States" "United States" ...
## $ scores_teaching : num 91.3 92.2 94.4 92.5 90.7 90.3 85.8 91.9 88.8 88.9 ...
## $ scores_research : num 99.6 96.7 98.8 96.9 94.4 99.2 97.2 93.8 92.5 90.5 ...
## $ scores_citations : num 98 99.9 99.4 97 99.7 95.6 99.1 97.9 98.9 98.6 ...
## $ scores_industry_income : num 68.7 90.1 46.8 92.7 90.4 52.1 84.3 56.1 58 54.9 ...
## $ scores_international_outlook: num 96.4 79.5 77.7 83.6 90 95.7 72.3 68.4 80.2 74 ...
## $ stats_number_students : int 20774 16223 21261 2238 11276 19370 39918 12910 8091 14292 ...
## $ stats_student_staff_ratio : num 11.1 7.4 9.3 6.3 8.4 11 19.8 6 8 5.9 ...
## $ stats_pc_intl_students : int 41 23 25 33 34 38 17 20 23 31 ...
## $ stats_female_share : int 46 44 49 36 39 47 51 50 46 46 ...
datatable(data_raw,
options = list(pageLength = 5))
data_raw[,3:11] %>%
#na.omit() %>%
cor() %>%
corrplot() # six score variables correlate with each other, the two internationalisation variables form another pair, while female share is not related to the rest -- consider taking it out.
data_raw <- data_raw %>%
relocate(stats_pc_intl_students, .after = scores_international_outlook)
data_raw[,3:11] %>%
#na.omit() %>%
cor() %>%
corrplot()
# library(summarytools)
# dfSummary(data_raw)
describe(data_raw)[11:12]
## skew kurtosis
## name* 0.00 -1.20
## location* -0.06 -1.46
## scores_teaching 2.08 5.35
## scores_research 1.95 4.15
## scores_citations 0.26 -1.15
## scores_industry_income 1.84 2.58
## scores_international_outlook 0.58 -0.74
## stats_pc_intl_students 1.81 4.47
## stats_number_students 3.81 24.69
## stats_student_staff_ratio 2.33 10.18
## stats_female_share -0.73 0.75
Some variables are definitely non-normally distributed. How does this affect the results? The answer is not simple.
However, the good news is that PCA requires linearity of relationships between variables but not the normality of their distribution (See this comment https://datascience.stackexchange.com/questions/25789/why-does-pca-assume-gaussian-distribution).
PCA holds no strong distributional assumptions and ‘normality is important only to the extent that skewness or outliers affect the observed correlations’ https://tandfbis.s3.amazonaws.com/rt-media/pdf/9781848729995/IBM_SPSS_5e_Chapter_4.pdf
Read also this short paper to see how to test the robustness of PCs once you have them https://onlinelibrary.wiley.com/doi/epdf/10.1111/evo.13835
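As a preview of that kind of robustness check, here is a minimal bootstrap sketch (my own illustration, not the paper's exact procedure): resample rows, re-run the PCA, and see how stable the PC1 loadings are across resamples.
set.seed(1)
boot_loadings <- replicate(200, {
  idx <- sample(nrow(data_raw), replace = TRUE)     # resample universities with replacement
  p <- prcomp(data_raw[idx, 3:11], scale. = TRUE)   # re-run PCA on the bootstrap sample
  p$rotation[, 1] * sign(p$rotation[1, 1])          # fix the arbitrary sign of PC1
})
round(apply(boot_loadings, 1, sd), 3)               # small SDs suggest stable PC1 loadings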
There are tests for multivariate normality if you want to use them, but I do not require them. Here is one example (H1: the data are not multivariate normal):
library(QuantPsyc)
mult.norm(data_raw[ , 3:11])$mult.test
## Beta-hat kappa p-val
## Skewness 47.2503 11403.07265 0
## Kurtosis 162.1288 85.35902 0
Look at how different the variances are:
lapply(data_raw[ , 3:11], var)
## $scores_teaching
## [1] 179.3113
##
## $scores_research
## [1] 293.1243
##
## $scores_citations
## [1] 766.5359
##
## $scores_industry_income
## [1] 275.3449
##
## $scores_international_outlook
## [1] 540.7424
##
## $stats_pc_intl_students
## [1] 144.5496
##
## $stats_number_students
## [1] 424682073
##
## $stats_student_staff_ratio
## [1] 105.9024
##
## $stats_female_share
## [1] 147.2624
Standardize (= scale) only the numeric variables:
data_scaled <- data_raw
data_scaled[ , 3:11] <- data_raw[ , 3:11] %>%
scale() %>%
as.data.frame()
str(data_scaled)
## 'data.frame': 1448 obs. of 11 variables:
## $ name : chr "University of Oxford" "Stanford University" "Harvard University" "California Institute of Technology" ...
## $ location : chr "United Kingdom" "United States" "United States" "United States" ...
## $ scores_teaching : num 4.76 4.83 4.99 4.85 4.72 ...
## $ scores_research : num 4.47 4.3 4.42 4.31 4.16 ...
## $ scores_citations : num 1.81 1.88 1.87 1.78 1.88 ...
## $ scores_industry_income : num 1.3969 2.6866 0.0771 2.8433 2.7047 ...
## $ scores_international_outlook: num 2.13 1.4 1.32 1.58 1.85 ...
## $ stats_pc_intl_students : num 2.473 0.976 1.143 1.808 1.891 ...
## $ stats_number_students : num -0.0712 -0.292 -0.0475 -0.9706 -0.5321 ...
## $ stats_student_staff_ratio : num -0.72 -1.08 -0.895 -1.187 -0.983 ...
## $ stats_female_share : num -0.3246 -0.4894 -0.0774 -1.1487 -0.9014 ...
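A quick sanity check that the scaling worked: every numeric column should now have mean 0 and standard deviation 1.
round(colMeans(data_scaled[ , 3:11]), 2)   # all means should be 0
round(sapply(data_scaled[ , 3:11], sd), 2) # all standard deviations should be 1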
Run PCA
pca_scores <- prcomp(data_scaled[,3:11])
str(pca_scores, give.attr = F)
## List of 5
## $ sdev : num [1:9] 1.867 1.21 1.127 0.948 0.816 ...
## $ rotation: num [1:9, 1:9] -0.438 -0.483 -0.395 -0.278 -0.422 ...
## $ center : Named num [1:9] -7.78e-17 -8.45e-17 -6.91e-17 5.70e-17 7.90e-17 ...
## $ scale : logi FALSE
## $ x : num [1:1448, 1:9] -7.23 -6.65 -6.1 -7.06 -7.09 ...
How much variance do the PCs retain?
pca_scores$sdev ^ 2
## [1] 3.48591462 1.46337701 1.26901618 0.89840475 0.66615817 0.55930826 0.43691022
## [8] 0.14926140 0.07164939
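To see these figures as proportions of the total variance, you can divide by their sum or simply call summary() on the prcomp object:
pca_scores$sdev^2 / sum(pca_scores$sdev^2)  # proportion of variance per PC
summary(pca_scores)                         # standard deviation, proportion, cumulative proportion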
ggbiplot::ggscreeplot(pca_scores)
ggbiplot::ggbiplot(pca_scores, var.axes = T, alpha = 0.1)
There is another option that prevents the variable labels from overlapping:
ggplot2::autoplot(
pca_scores,
data = data_scaled,
loadings = TRUE,
loadings.colour = 'brown',
loadings.label.colour = 'brown',
loadings.label = TRUE,
loadings.label.size = 4,
loadings.label.repel = TRUE,
alpha = 0.1
)
To get labels, add the new components to the data and plot them:
score_comps <- cbind(data_scaled, pca_scores$x[, 1:3]) %>%
as.data.frame
bi <- ggplot(score_comps, aes(PC1, PC2, text = name)) +
geom_point(alpha = 0.1) +
theme_bw()
ggplotly(bi, tooltip = "text")
Let’s select a group and map it on PCs:
score_components <- cbind(data_scaled, pca_scores$x[, 1:3]) %>%
as.data.frame
#table(score_components$location)
score_components$NAm <- if_else(score_components$location %in% c("United States", "Canada"), "NAm", "not")
gr <- factor(score_components$NAm)
summary(gr)
## NAm not
## 201 1247
gg1 <- ggplot(score_components, aes(PC1, PC2, text = name)) +
geom_point(aes(color = NAm), alpha = 0.5) +
theme_bw()
ggplotly(gg1, tooltip = "text")
You can add formal tests to quantify the differences: the difference in PC1 is statistically significant, but the differences in PC2 and PC3 are not.
t.test(score_components$PC1 ~ score_components$NAm)
##
## Welch Two Sample t-test
##
## data: score_components$PC1 by score_components$NAm
## t = -9.5491, df = 253.12, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group NAm and group not is not equal to 0
## 95 percent confidence interval:
## -1.721327 -1.132717
## sample estimates:
## mean in group NAm mean in group not
## -1.228934 0.198088
t.test(score_components$PC2 ~ score_components$NAm)
##
## Welch Two Sample t-test
##
## data: score_components$PC2 by score_components$NAm
## t = -0.766, df = 373.03, p-value = 0.4442
## alternative hypothesis: true difference in means between group NAm and group not is not equal to 0
## 95 percent confidence interval:
## -0.18572089 0.08158888
## sample estimates:
## mean in group NAm mean in group not
## -0.044838609 0.007227394
t.test(score_components$PC3 ~ score_components$NAm)
##
## Welch Two Sample t-test
##
## data: score_components$PC3 by score_components$NAm
## t = 0.67417, df = 349.81, p-value = 0.5007
## alternative hypothesis: true difference in means between group NAm and group not is not equal to 0
## 95 percent confidence interval:
## -0.08551853 0.17472418
## sample estimates:
## mean in group NAm mean in group not
## 0.038411407 -0.006191414
delete the female share variable (it can be used later to differentiate between universities), as it is uncorrelated with the other variables
log-transform teaching, research, industry income, number of students, student-staff ratio, and the percentage of international students, to reduce their skewness
remember to scale everything AFTER log-transforming and BEFORE running PCA: if you log-transform after scaling, you will lose every data point below the mean, because the log of a negative number is undefined (a quick demonstration follows the code below)
Note that we add a small constant here because the log of 0 is undefined:
data_log <- data_raw # copy all raw variables
data_log$log_teaching <- log(data_log$scores_teaching)
data_log$log_research <- log(data_log$scores_research)
data_log$log_industry <- log(data_log$scores_industry_income)
data_log$log_stud <- log(data_log$stats_number_students)
data_log$log_staff <- log(data_log$stats_student_staff_ratio)
data_log$log_intl <- log(data_log$stats_pc_intl_students + 0.1)
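A quick toy demonstration (not part of the analysis) of why the order of scaling and log-transforming matters: after scaling, values below the mean are negative, and log() turns them into NaN.
log(scale(c(1, 5, 10, 50)))  # the three values below the mean become NaN (with a warning)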
You may wonder: why log-transform? There are many correct answers. It is common practice, and there are statistical reasons too (see Andrew Gelman’s comment ‘Log transform, kids.’ https://statmodeling.stat.columbia.edu/2019/08/21/you-should-usually-log-transform-your-positive-data/). You may also think of these right-skewed variables as roughly log-normal, so they look approximately normal after this transformation: https://upload.wikimedia.org/wikipedia/commons/thumb/4/4e/Lognormal_Distribution.svg/660px-Lognormal_Distribution.svg.png
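A small simulated check of that claim (a toy example with made-up log-normal data): the skewness drops to roughly zero after taking logs.
set.seed(1)
x <- rlnorm(1000, meanlog = 0, sdlog = 1)                # strongly right-skewed
c(before = psych::skew(x), after = psych::skew(log(x)))  # skewness before vs after the log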
data_short <- dplyr::select(data_log,
name,
location,
log_teaching,
log_research,
scores_citations,
log_industry,
scores_international_outlook,
log_intl,
log_stud,
log_staff)
Now the data are closer to a normal distribution:
describe(data_short)[11:12]
## skew kurtosis
## name* 0.00 -1.20
## location* -0.06 -1.46
## log_teaching 0.84 0.38
## log_research 0.57 -0.44
## scores_citations 0.26 -1.15
## log_industry 1.38 0.88
## scores_international_outlook 0.58 -0.74
## log_intl -1.08 0.87
## log_stud -0.63 1.10
## log_staff -0.79 3.89
Note that using more variables and deleting cases with NAs reduces the sample, which may make it less representative.
How you proceed with your own data depends on your goals.
data_short[ , 3:10] <- data_short[ , 3:10] %>%
scale() %>%
as.data.frame()
summary(data_short)
## name location log_teaching log_research
## Length:1448 Length:1448 Min. :-1.9208 Min. :-1.5832
## Class :character Class :character 1st Qu.:-0.7733 1st Qu.:-0.8134
## Mode :character Mode :character Median :-0.1848 Median :-0.1495
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5932 3rd Qu.: 0.6947
## Max. : 3.3084 Max. : 2.7104
## scores_citations log_industry scores_international_outlook
## Min. :-1.6563 Min. :-0.8811 Min. :-1.4391
## 1st Qu.:-0.8888 1st Qu.:-0.7427 1st Qu.:-0.8371
## Median :-0.1068 Median :-0.4021 Median :-0.2028
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.8359 3rd Qu.: 0.3757 3rd Qu.: 0.6906
## Max. : 1.8869 Max. : 2.8148 Max. : 2.2807
## log_intl log_stud log_staff
## Min. :-2.5697 Min. :-3.87434 Min. :-5.385033
## 1st Qu.:-0.6028 1st Qu.:-0.57091 1st Qu.:-0.528172
## Median : 0.1842 Median : 0.08956 Median : 0.000173
## Mean : 0.0000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 0.7521 3rd Qu.: 0.67778 3rd Qu.: 0.572468
## Max. : 1.7813 Max. : 3.03744 Max. : 3.553679
data_short[,3:10] %>%
cor() %>%
corrplot()
Run PCA again, with a different function:
soc.pca1 <- PCA(data_short[, 3:10], graph = T)
summary.PCA(soc.pca1)
##
## Call:
## PCA(X = data_short[, 3:10], graph = T)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 3.510 1.481 1.020 0.712 0.567 0.362 0.239
## % of var. 43.874 18.515 12.745 8.897 7.087 4.522 2.992
## Cumulative % of var. 43.874 62.389 75.134 84.031 91.118 95.640 98.632
## Dim.8
## Variance 0.109
## % of var. 1.368
## Cumulative % of var. 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr
## 1 | 5.506 | 5.312 0.555 0.931 | -0.659 0.020
## 2 | 5.681 | 5.150 0.522 0.822 | -1.581 0.117
## 3 | 5.071 | 4.515 0.401 0.793 | -0.892 0.037
## 4 | 6.291 | 5.229 0.538 0.691 | -3.268 0.498
## 5 | 5.776 | 5.383 0.570 0.869 | -1.611 0.121
## 6 | 5.244 | 4.945 0.481 0.889 | -0.642 0.019
## 7 | 5.295 | 4.827 0.458 0.831 | 0.404 0.008
## 8 | 5.144 | 4.369 0.376 0.721 | -1.951 0.177
## 9 | 5.133 | 4.594 0.415 0.801 | -1.847 0.159
## 10 | 5.171 | 4.507 0.400 0.760 | -1.826 0.156
## cos2 Dim.3 ctr cos2
## 1 0.014 | 0.622 0.026 0.013 |
## 2 0.077 | 1.552 0.163 0.075 |
## 3 0.031 | 0.313 0.007 0.004 |
## 4 0.270 | 0.908 0.056 0.021 |
## 5 0.078 | 1.146 0.089 0.039 |
## 6 0.015 | 0.058 0.000 0.000 |
## 7 0.006 | 1.961 0.260 0.137 |
## 8 0.144 | 0.725 0.036 0.020 |
## 9 0.129 | 0.395 0.011 0.006 |
## 10 0.125 | 0.437 0.013 0.007 |
##
## Variables
## Dim.1 ctr cos2 Dim.2 ctr cos2
## log_teaching | 0.804 18.416 0.646 | -0.251 4.237 0.063 |
## log_research | 0.921 24.171 0.848 | 0.023 0.035 0.001 |
## scores_citations | 0.742 15.688 0.551 | 0.037 0.094 0.001 |
## log_industry | 0.570 9.270 0.325 | -0.112 0.847 0.013 |
## scores_international_outlook | 0.767 16.758 0.588 | 0.155 1.616 0.024 |
## log_intl | 0.739 15.579 0.547 | 0.066 0.293 0.004 |
## log_stud | 0.064 0.118 0.004 | 0.796 42.811 0.634 |
## log_staff | -0.002 0.000 0.000 | 0.861 50.067 0.742 |
## Dim.3 ctr cos2
## log_teaching 0.294 8.479 0.086 |
## log_research 0.209 4.269 0.044 |
## scores_citations -0.191 3.578 0.036 |
## log_industry 0.635 39.547 0.403 |
## scores_international_outlook -0.478 22.386 0.228 |
## log_intl -0.402 15.876 0.162 |
## log_stud 0.234 5.370 0.055 |
## log_staff 0.071 0.495 0.005 |
Now, why is there a circle? It is called the ‘correlation circle’: it has radius 1, and the loadings of all standardized variables must fall within it. (For non-standardized variables, the lengths of the loading arrows approximate the standard deviations of the original variables.) Here is the best explanation I have found so far: https://stats.stackexchange.com/questions/141085/positioning-the-arrows-on-a-pca-biplot
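If you want just the correlation circle for the variables, without the cloud of individuals, factoextra has a dedicated plot:
fviz_pca_var(soc.pca1, repel = TRUE)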
fviz_screeplot(soc.pca1, addlabels = TRUE)
soc.pca1$var$contrib[ ,1:3]
## Dim.1 Dim.2 Dim.3
## log_teaching 1.841557e+01 4.23718422 8.4785459
## log_research 2.417130e+01 0.03539823 4.2691499
## scores_citations 1.568818e+01 0.09366441 3.5784932
## log_industry 9.270254e+00 0.84700351 39.5468495
## scores_international_outlook 1.675807e+01 1.61593886 22.3857099
## log_intl 1.557860e+01 0.29307125 15.8756093
## log_stud 1.178725e-01 42.81055926 5.3701857
## log_staff 1.566083e-04 50.06718025 0.4954567
Two PCs retain 62% of the variance, which is not an ideal solution:
soc.pca1$eig
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 3.5098819 43.873524 43.87352
## comp 2 1.4812078 18.515098 62.38862
## comp 3 1.0196111 12.745138 75.13376
## comp 4 0.7117839 8.897299 84.03106
## comp 5 0.5669757 7.087197 91.11826
## comp 6 0.3617685 4.522107 95.64036
## comp 7 0.2393296 2.991620 98.63198
## comp 8 0.1094414 1.368018 100.00000
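A rough rule of thumb (Kaiser’s criterion, only one of several stopping rules) is to keep components with an eigenvalue above 1; here that points to three components, even though we plot only the first two below:
sum(soc.pca1$eig[, "eigenvalue"] > 1)  # 3 components have an eigenvalue above 1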
Let’s look at a 2-PC solution:
pca_scores1 <- prcomp(data_short[ , 3:10])
score_components1 <- cbind(data_short, pca_scores1$x[, 1:3]) %>%
as.data.frame
gg1 <- ggplot(score_components1, aes(PC1, PC2, text = name)) +
geom_point(alpha = 0.1) +
theme_bw()
ggplotly(gg1, tooltip = "text")
evaluate the quality of the PCA and interpret the 1st and 2nd principal components based on the variables you used (the contribution plots sketched below can help);
the more variance the first two components retain, the better;
make sense of the PCA solution by looking at specific data points in the new coordinates and telling a story.
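One way to start that interpretation (a suggestion using factoextra, not the only option) is to plot which variables contribute most to each of the first two dimensions:
fviz_contrib(soc.pca1, choice = "var", axes = 1)  # contributions to Dim.1
fviz_contrib(soc.pca1, choice = "var", axes = 2)  # contributions to Dim.2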
Now simulate three correlated variables with equal variances (1 on the diagonal of Sigma), so that the scaled and non-scaled PCA solutions can be compared:
library(MASS)   # for mvrnorm()
library(GGally) # for ggpairs()
sigma <- rbind(c(1, 0.9, 0.5), c(0.9, 1, 0.1), c(0.5, 0.1, 1))
mu <- c(3, 13, 23)
df <- as.data.frame(mvrnorm(n = 1000, mu = mu, Sigma = sigma))
head(df)
## V1 V2 V3
## 1 3.769898 13.46396 24.18749
## 2 2.198070 11.83659 23.39284
## 3 2.276986 12.14388 22.78743
## 4 4.088129 13.82256 24.06543
## 5 4.390463 14.78453 23.06515
## 6 4.248605 13.78361 24.12097
cov(df)
## V1 V2 V3
## V1 0.9488339 0.86953211 0.46052264
## V2 0.8695321 0.99686253 0.04521317
## V3 0.4605226 0.04521317 1.04003360
cor(df)
## V1 V2 V3
## V1 1.0000000 0.89407280 0.46358801
## V2 0.8940728 1.00000000 0.04440416
## V3 0.4635880 0.04440416 1.00000000
ggpairs(df)
corrplot(sigma, is.corr = T)
Run a PCA with scaling (standardization):
ex.pca <- prcomp(df, center = TRUE, scale. = TRUE)
summary(ex.pca)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.4233 0.9817 0.10262
## Proportion of Variance 0.6753 0.3212 0.00351
## Cumulative Proportion 0.6753 0.9965 1.00000
#ggbiplot(ex.pca, var.axes = T, alpha = 0.1)
f1 <- ggbiplot::ggbiplot(ex.pca, var.axes = T, alpha = 0.1)
Run a PCA withOUT scaling (standardization):
ex.pca2 <- prcomp(df, center = TRUE, scale. = FALSE)
summary(ex.pca2)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.408 0.9965 0.10135
## Proportion of Variance 0.664 0.3326 0.00344
## Cumulative Proportion 0.664 0.9966 1.00000
#ggbiplot(ex.pca2, var.axes = T, alpha = 0.1)
f2 <- ggbiplot::ggbiplot(ex.pca2, var.axes = T, alpha = 0.1)
Run a PCA withOUT even centering the data:
ex.pca3 <- prcomp(df, center = FALSE, scale. = FALSE)
summary(ex.pca3)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 26.6188 1.18305 0.57500
## Proportion of Variance 0.9976 0.00197 0.00047
## Cumulative Proportion 0.9976 0.99953 1.00000
#ggbiplot(ex.pca3, var.axes = T, alpha = 0.1)
f3 <- ggbiplot::ggbiplot(ex.pca3, var.axes = T, alpha = 0.1)
library(patchwork)
f1 + f2 + f3
With equal variances, the scaled and non-scaled solutions are nearly identical. Now repeat the simulation with unequal variances (1, 2, and 3) on the diagonal of Sigma:
sigma <- rbind(c(1, 0.9, 0.5), c(0.9, 2, 0.1), c(0.5, 0.1, 3))
mu <- c(3, 13, 23)
df <- as.data.frame(mvrnorm(n = 1000, mu = mu, Sigma = sigma))
head(df)
## V1 V2 V3
## 1 3.944593 13.08659 22.49736
## 2 1.453579 12.92675 21.66832
## 3 4.551205 12.68153 24.12930
## 4 2.693779 11.79429 22.78315
## 5 2.222404 12.72665 20.78659
## 6 1.760597 10.75841 20.06451
cov(df)
## V1 V2 V3
## V1 1.0349156 0.9163631 0.6749478
## V2 0.9163631 2.0549167 0.1845540
## V3 0.6749478 0.1845540 3.2077519
cor(df)
## V1 V2 V3
## V1 1.0000000 0.62837374 0.37043952
## V2 0.6283737 1.00000000 0.07188297
## V3 0.3704395 0.07188297 1.00000000
ggpairs(df)
Run a PCA with scaling (standardization):
ex.pca <- prcomp(df, center = TRUE, scale. = TRUE)
summary(ex.pca)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.3275 0.9681 0.5481
## Proportion of Variance 0.5874 0.3124 0.1001
## Cumulative Proportion 0.5874 0.8999 1.0000
#ggbiplot(ex.pca, var.axes = T, alpha = 0.1)
f1 <- ggbiplot::ggbiplot(ex.pca, var.axes = T, alpha = 0.1)
Run a PCA withOUT scaling (standardization):
ex.pca2 <- prcomp(df, center = TRUE, scale. = FALSE)
summary(ex.pca2)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.8838 1.5302 0.63828
## Proportion of Variance 0.5635 0.3718 0.06469
## Cumulative Proportion 0.5635 0.9353 1.00000
#ggbiplot(ex.pca2, var.axes = T, alpha = 0.1)
f2 <- ggbiplot::ggbiplot(ex.pca2, var.axes = T, alpha = 0.1)
Run a PCA withOUT even centering the data:
ex.pca3 <- prcomp(df, center = FALSE, scale. = FALSE)
summary(ex.pca3)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 26.6767 1.53166 0.81144
## Proportion of Variance 0.9958 0.00328 0.00092
## Cumulative Proportion 0.9958 0.99908 1.00000
#ggbiplot(ex.pca3, var.axes = T, alpha = 0.1)
f3 <- ggbiplot::ggbiplot(ex.pca3, var.axes = T, alpha = 0.1)
library(patchwork)
f1 + f2 + f3
With unequal variances, the non-scaled (covariance-based) solution differs noticeably from the scaled (correlation-based) one, because the variables with the largest variances dominate the first components. A variance-covariance matrix has the variances on the diagonal and the covariances in the off-diagonal cells (it is symmetric): https://www.researchgate.net/publication/237014579/figure/fig1/AS:213963448033280@1428024269392/Variance-covariance-matrix-depicting-homogeneity-of-variance-and-compound-symmetry.png
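A quick check of how the two matrices are related (using the simulated df from above): the covariance matrix of standardized data is the correlation matrix of the original data, which is why scaling before PCA is equivalent to running PCA on the correlation matrix.
all.equal(cov(scale(df)), cor(df))  # TRUE: covariance of scaled data = correlation of raw data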
Computer lab Jan 28 planned: master biplots and replicate this analysis on your data https://www.datacamp.com/tutorial/pca-analysis-r