1- Empirical example
2- PCA on correlated vs uncorrelated variables
3- PCA on the correlation matrix (scaled) vs the covariance matrix (non-scaled)
library(magrittr)
library(tidyverse)
library(corrplot)
library(psych)
library(plotly)
library(htmlwidgets)
library(GGally)     # for ggpairs() in the simulation examples below
require(FactoMineR)
require(factoextra)
Times Higher Education 2021 ranking data
data <- read.csv("THE2021.csv")
str(data)
## 'data.frame': 1448 obs. of 11 variables:
## $ name : chr "University of Oxford" "Stanford University" "Harvard University" "California Institute of Technology" ...
## $ location : chr "United Kingdom" "United States" "United States" "United States" ...
## $ scores_teaching : num 91.3 92.2 94.4 92.5 90.7 90.3 85.8 91.9 88.8 88.9 ...
## $ scores_research : num 99.6 96.7 98.8 96.9 94.4 99.2 97.2 93.8 92.5 90.5 ...
## $ scores_citations : num 98 99.9 99.4 97 99.7 95.6 99.1 97.9 98.9 98.6 ...
## $ scores_industry_income : num 68.7 90.1 46.8 92.7 90.4 52.1 84.3 56.1 58 54.9 ...
## $ scores_international_outlook: num 96.4 79.5 77.7 83.6 90 95.7 72.3 68.4 80.2 74 ...
## $ stats_number_students : int 20774 16223 21261 2238 11276 19370 39918 12910 8091 14292 ...
## $ stats_student_staff_ratio : num 11.1 7.4 9.3 6.3 8.4 11 19.8 6 8 5.9 ...
## $ stats_pc_intl_students : int 41 23 25 33 34 38 17 20 23 31 ...
## $ stats_female_share : int 46 44 49 36 39 47 51 50 46 46 ...
data[,3:11] %>%
#na.omit() %>%
cor() %>%
corrplot() # six variables correlate with one another, the two internationalisation variables form a separate pair, and the female share is unrelated to the rest -- consider dropping it.
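If you do want to see the correlation structure without the gender-balance variable, a quick sketch (not applied below):
data[ , 3:11] %>%
  select(-stats_female_share) %>%
  cor() %>%
  corrplot()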
# library(summarytools)
# dfSummary(data)
Check the skewness and kurtosis of the variables:
describe(data)[11:12]
## skew kurtosis
## name* 0.00 -1.20
## location* -0.06 -1.46
## scores_teaching 2.08 5.35
## scores_research 1.95 4.15
## scores_citations 0.26 -1.15
## scores_industry_income 1.84 2.58
## scores_international_outlook 0.58 -0.74
## stats_number_students 3.81 24.69
## stats_student_staff_ratio 2.33 10.18
## stats_pc_intl_students 1.81 4.47
## stats_female_share -0.73 0.75
Look at how different the variances are across the variables:
lapply(data[ , 3:11], var)
## $scores_teaching
## [1] 179.3113
##
## $scores_research
## [1] 293.1243
##
## $scores_citations
## [1] 766.5359
##
## $scores_industry_income
## [1] 275.3449
##
## $scores_international_outlook
## [1] 540.7424
##
## $stats_number_students
## [1] 424682073
##
## $stats_student_staff_ratio
## [1] 105.9024
##
## $stats_pc_intl_students
## [1] 144.5496
##
## $stats_female_share
## [1] 147.2624
Standardize (scale) only the numeric variables:
data[ , 3:11] <- data[ , 3:11] %>%
scale() %>%
as.data.frame()
str(data)
## 'data.frame': 1448 obs. of 11 variables:
## $ name : chr "University of Oxford" "Stanford University" "Harvard University" "California Institute of Technology" ...
## $ location : chr "United Kingdom" "United States" "United States" "United States" ...
## $ scores_teaching : num 4.76 4.83 4.99 4.85 4.72 ...
## $ scores_research : num 4.47 4.3 4.42 4.31 4.16 ...
## $ scores_citations : num 1.81 1.88 1.87 1.78 1.88 ...
## $ scores_industry_income : num 1.3969 2.6866 0.0771 2.8433 2.7047 ...
## $ scores_international_outlook: num 2.13 1.4 1.32 1.58 1.85 ...
## $ stats_number_students : num -0.0712 -0.292 -0.0475 -0.9706 -0.5321 ...
## $ stats_student_staff_ratio : num -0.72 -1.08 -0.895 -1.187 -0.983 ...
## $ stats_pc_intl_students : num 2.473 0.976 1.143 1.808 1.891 ...
## $ stats_female_share : num -0.3246 -0.4894 -0.0774 -1.1487 -0.9014 ...
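A quick check that the scaling worked: the column means should be (numerically) zero and the standard deviations one.
round(colMeans(data[ , 3:11]), 3)
sapply(data[ , 3:11], sd)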
Run PCA
pca_scores <- prcomp(data[,3:11])
str(pca_scores, give.attr = F)
## List of 5
## $ sdev : num [1:9] 1.867 1.21 1.127 0.948 0.816 ...
## $ rotation: num [1:9, 1:9] -0.438 -0.483 -0.395 -0.278 -0.422 ...
## $ center : Named num [1:9] -7.78e-17 -8.45e-17 -6.91e-17 5.70e-17 7.90e-17 ...
## $ scale : logi FALSE
## $ x : num [1:1448, 1:9] -7.23 -6.65 -6.1 -7.06 -7.09 ...
How much variance do the PCs retain?
pca_scores$sdev ^ 2
## [1] 3.48591462 1.46337701 1.26901618 0.89840475 0.66615817 0.55930826 0.43691022
## [8] 0.14926140 0.07164939
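These are the eigenvalues, i.e. the variances of the individual PCs. To express them as proportions of the total variance, divide by their sum, or simply call summary():
pca_scores$sdev^2 / sum(pca_scores$sdev^2)  # proportion of variance per PC
summary(pca_scores)                         # the same information, formatted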
library(ggbiplot)
ggbiplot(pca_scores, var.axes = T, alpha = 0.1)
bi <- ggbiplot(pca_scores, var.axes = T, alpha = 0.1)
ggplotly(bi) # see tooltip argument for labels
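One way to get university names on hover is the same pattern used further below: plot the scores yourself with a text aesthetic and pass it to the tooltip argument (a sketch; the object names here are arbitrary):
scores_df <- as.data.frame(pca_scores$x)
scores_df$name <- data$name
bi_hover <- ggplot(scores_df, aes(PC1, PC2, text = name)) +
  geom_point(alpha = 0.1) +
  theme_bw()
ggplotly(bi_hover, tooltip = "text")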
Let’s select a group and map it onto the PCs:
score_components <- cbind(data, pca_scores$x[, 1:3]) %>%
as.data.frame
#table(score_components$location)
score_components$NAm <- if_else(score_components$location %in% c("United States", "Canada"), "NAm", "not")
gr <- factor(score_components$NAm)
summary(gr)
## NAm not
## 201 1247
gg1 <- ggplot(score_components, aes(PC1, PC2)) +
geom_point(aes(color = NAm)) +
#geom_text(label = if_else(score_components$NAm == "NAm", str_trunc(score_components$name, 20), "")) +
theme_bw()
ggplotly(gg1)
Create labels for a region of data points:
gg2 <- ggplot(score_components, aes(PC1, PC2)) +
geom_point(aes(color = NAm)) +
geom_text(label = if_else(score_components$PC1 < -5 & score_components$PC2 > 0, str_trunc(score_components$name, 15), "")) +
theme_bw()
ggplotly(gg2)
You can add formal tests to quantify the differences: the group difference along PC1 is statistically significant, while the differences along PC2 and PC3 are not.
t.test(score_components$PC1 ~ score_components$NAm)
##
## Welch Two Sample t-test
##
## data: score_components$PC1 by score_components$NAm
## t = -9.5491, df = 253.12, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group NAm and group not is not equal to 0
## 95 percent confidence interval:
## -1.721327 -1.132717
## sample estimates:
## mean in group NAm mean in group not
## -1.228934 0.198088
t.test(score_components$PC2 ~ score_components$NAm)
##
## Welch Two Sample t-test
##
## data: score_components$PC2 by score_components$NAm
## t = -0.766, df = 373.03, p-value = 0.4442
## alternative hypothesis: true difference in means between group NAm and group not is not equal to 0
## 95 percent confidence interval:
## -0.18572089 0.08158888
## sample estimates:
## mean in group NAm mean in group not
## -0.044838609 0.007227394
t.test(score_components$PC3 ~ score_components$NAm)
##
## Welch Two Sample t-test
##
## data: score_components$PC3 by score_components$NAm
## t = 0.67417, df = 349.81, p-value = 0.5007
## alternative hypothesis: true difference in means between group NAm and group not is not equal to 0
## 95 percent confidence interval:
## -0.08551853 0.17472418
## sample estimates:
## mean in group NAm mean in group not
## 0.038411407 -0.006191414
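A more compact way to see the same pattern is to compare the group means directly (a sketch using dplyr):
score_components %>%
  group_by(NAm) %>%
  summarise(across(c(PC1, PC2, PC3), mean))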
Log-transform the skewed variables. A small constant is added to the share of international students because the log of 0 is undefined. Note also that these columns were already standardized above, so the log of their non-positive values will be NaN:
data$log_teaching <- log(data$scores_teaching)
data$log_research <- log(data$scores_research)
data$log_industry <- log(data$scores_industry_income)
data$log_stud <- log(data$stats_number_students)
data$log_staff <- log(data$stats_student_staff_ratio)
data$log_intl <- log(data$stats_pc_intl_students + 0.1)
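Because of the standardization, many values are non-positive and their logarithm is NaN; a quick count of how many values are lost per column:
colSums(is.na(data[ , c("log_teaching", "log_research", "log_industry",
                        "log_stud", "log_staff", "log_intl")]))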
data_short <- select(data,
name,
location,
log_teaching,
log_research,
scores_citations,
log_industry,
scores_international_outlook,
log_intl,
log_stud,
log_staff)
Now the data are closer to a normal distribution:
describe(data_short)[11:12]
## skew kurtosis
## name* 0.00 -1.20
## location* -0.06 -1.46
## log_teaching -0.74 0.76
## log_research -0.77 0.61
## scores_citations 0.26 -1.15
## log_industry -1.33 1.85
## scores_international_outlook 0.58 -0.74
## log_intl -0.45 -0.55
## log_stud -0.85 1.88
## log_staff -0.76 0.53
Note that here we use more variables but drop cases with NAs (the NaNs produced by logging non-positive standardized values), so the sample is much smaller and no longer representative. How you proceed with your own data depends on your goals.
data_short[ , 3:10] <- data_short[ , 3:10] %>%
scale()
data_short <- as.data.frame(data_short)
summary(data_short)
## name location log_teaching log_research
## Length:1448 Length:1448 Min. :-3.8700 Min. :-4.0292
## Class :character Class :character 1st Qu.:-0.5779 1st Qu.:-0.5943
## Mode :character Mode :character Median : 0.1417 Median : 0.1407
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7067 3rd Qu.: 0.7194
## Max. : 1.7790 Max. : 1.6353
## NA's :935 NA's :945
## scores_citations log_industry scores_international_outlook
## Min. :-1.6563 Min. :-3.8610 Min. :-1.4391
## 1st Qu.:-0.8888 1st Qu.:-0.4940 1st Qu.:-0.8371
## Median :-0.1068 Median : 0.2425 Median :-0.2028
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.8359 3rd Qu.: 0.7562 3rd Qu.: 0.6906
## Max. : 1.8869 Max. : 1.1185 Max. : 2.2807
## NA's :1035
## log_intl log_stud log_staff
## Min. :-2.0290 Min. :-4.8670 Min. :-3.0946
## 1st Qu.:-0.6689 1st Qu.:-0.5325 1st Qu.:-0.5030
## Median : 0.1082 Median : 0.1131 Median : 0.1321
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7688 3rd Qu.: 0.6399 3rd Qu.: 0.6991
## Max. : 2.1118 Max. : 2.4083 Max. : 2.2159
## NA's :873 NA's :904 NA's :910
data_short <- na.omit(data_short)
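Check how many universities remain after dropping the incomplete rows:
nrow(data_short)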
Run PCA again, with a different function:
soc.pca1 <- PCA(data_short[, 3:10], graph = T)
summary.PCA(soc.pca1)
##
## Call:
## PCA(X = data_short[, 3:10], graph = T)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 2.694 1.901 1.063 0.909 0.657 0.492 0.156
## % of var. 33.680 23.767 13.283 11.361 8.213 6.156 1.952
## Cumulative % of var. 33.680 57.446 70.730 82.090 90.304 96.460 98.412
## Dim.8
## Variance 0.127
## % of var. 1.588
## Cumulative % of var. 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr
## 7 | 3.407 | 2.568 5.827 0.568 | 1.428 2.554
## 17 | 3.799 | 2.907 7.468 0.586 | -0.160 0.032
## 30 | 2.987 | 2.783 6.843 0.868 | -0.088 0.010
## 31 | 2.131 | 1.041 0.958 0.239 | 1.694 3.595
## 36 | 2.037 | 1.652 2.412 0.658 | -0.077 0.007
## 39 | 2.101 | 1.334 1.573 0.403 | 0.956 1.145
## 43 | 2.085 | 0.903 0.721 0.188 | 1.647 3.395
## 48 | 3.265 | 2.806 6.957 0.738 | -0.677 0.574
## 54 | 2.304 | 1.834 2.972 0.633 | -0.187 0.044
## 57 | 2.538 | 1.850 3.025 0.531 | -0.114 0.016
## cos2 Dim.3 ctr cos2
## 7 0.176 | -0.427 0.408 0.016 |
## 17 0.002 | -1.917 8.236 0.255 |
## 30 0.001 | 0.081 0.015 0.001 |
## 31 0.632 | 0.220 0.108 0.011 |
## 36 0.001 | 0.572 0.733 0.079 |
## 39 0.207 | 0.843 1.591 0.161 |
## 43 0.624 | -0.282 0.178 0.018 |
## 48 0.043 | -0.217 0.106 0.004 |
## 54 0.007 | 0.317 0.226 0.019 |
## 57 0.002 | -0.088 0.017 0.001 |
##
## Variables
## Dim.1 ctr cos2 Dim.2 ctr cos2
## log_teaching | 0.577 12.374 0.333 | 0.708 26.372 0.501 |
## log_research | 0.799 23.706 0.639 | 0.371 7.230 0.137 |
## scores_citations | 0.695 17.910 0.483 | -0.224 2.651 0.050 |
## log_industry | 0.156 0.903 0.024 | 0.628 20.764 0.395 |
## scores_international_outlook | 0.693 17.837 0.481 | -0.635 21.201 0.403 |
## log_intl | 0.582 12.565 0.339 | -0.490 12.611 0.240 |
## log_stud | 0.397 5.853 0.158 | 0.387 7.895 0.150 |
## log_staff | -0.488 8.853 0.239 | 0.156 1.277 0.024 |
## Dim.3 ctr cos2
## log_teaching -0.053 0.267 0.003 |
## log_research 0.234 5.171 0.055 |
## scores_citations -0.279 7.342 0.078 |
## log_industry 0.556 29.140 0.310 |
## scores_international_outlook 0.022 0.046 0.000 |
## log_intl 0.393 14.518 0.154 |
## log_stud -0.678 43.292 0.460 |
## log_staff 0.049 0.225 0.002 |
fviz_screeplot(soc.pca1, addlabels = TRUE)
soc.pca1$var$contrib[ ,1:3]
## Dim.1 Dim.2 Dim.3
## log_teaching 12.3739350 26.371522 0.26697277
## log_research 23.7055882 7.230497 5.17054762
## scores_citations 17.9097879 2.650631 7.34207163
## log_industry 0.9034306 20.763748 29.13971720
## scores_international_outlook 17.8368497 21.200549 0.04590029
## log_intl 12.5645372 12.610796 14.51836055
## log_stud 5.8533203 7.895497 43.29192810
## log_staff 8.8525511 1.276761 0.22450185
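The contributions can also be visualized directly with factoextra, for example for the first two dimensions (a sketch):
fviz_contrib(soc.pca1, choice = "var", axes = 1)
fviz_contrib(soc.pca1, choice = "var", axes = 2)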
Two PCs retain about 57% of the variance -- not a great solution.
soc.pca1$eig
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 2.6943816 33.679770 33.67977
## comp 2 1.9013302 23.766628 57.44640
## comp 3 1.0626532 13.283165 70.72956
## comp 4 0.9088552 11.360689 82.09025
## comp 5 0.6570669 8.213336 90.30359
## comp 6 0.4924876 6.156095 96.45968
## comp 7 0.1561454 1.951818 98.41150
## comp 8 0.1270799 1.588499 100.00000
Let’s look at a 2-PC solution:
pca_scores1 <- prcomp(data_short[ , 3:10])
score_components1 <- cbind(data_short, pca_scores1$x[, 1:3]) %>%
as.data.frame
gg1 <- ggplot(score_components1, aes(PC1, PC2, text = name)) +
geom_point() +
theme_bw()
ggplotly(gg1, tooltip = "text")
To wrap up the empirical example:
- evaluate the quality of the PCA and interpret the 1st and 2nd principal components based on the variables you used (see the sketch after this list);
- the more variance the first two components retain, the better;
- make sense of the PCA solution by looking at specific data points in the new coordinates and telling a story.
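A natural starting point for the interpretation step is the matrix of loadings (variable coordinates):
round(pca_scores1$rotation[ , 1:2], 2)  # prcomp loadings
round(soc.pca1$var$coord[ , 1:2], 2)    # FactoMineR variable coordinates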
Now let’s simulate data with a known correlation structure to see how PCA handles correlated vs uncorrelated variables. First, three variables with unit variances:
sigma <- rbind(c(1, 0.9, 0.5), c(0.9, 1, 0.1), c(0.5, 0.1, 1))
mu <- c(3, 13, 23)
df <- as.data.frame(MASS::mvrnorm(n = 1000, mu = mu, Sigma = sigma)) # MASS:: avoids masking dplyr::select
head(df)
## V1 V2 V3
## 1 3.769898 13.46396 24.18749
## 2 2.198070 11.83659 23.39284
## 3 2.276986 12.14388 22.78743
## 4 4.088129 13.82256 24.06543
## 5 4.390463 14.78453 23.06515
## 6 4.248605 13.78361 24.12097
cov(df)
## V1 V2 V3
## V1 0.9488339 0.86953211 0.46052264
## V2 0.8695321 0.99686253 0.04521317
## V3 0.4605226 0.04521317 1.04003360
cor(df)
## V1 V2 V3
## V1 1.0000000 0.89407280 0.46358801
## V2 0.8940728 1.00000000 0.04440416
## V3 0.4635880 0.04440416 1.00000000
ggpairs(df)
corrplot(sigma, is.corr = T)
Run a PCA with scaling (standardization):
ex.pca <- prcomp(df, center = TRUE, scale = TRUE)
summary(ex.pca)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.4233 0.9817 0.10262
## Proportion of Variance 0.6753 0.3212 0.00351
## Cumulative Proportion 0.6753 0.9965 1.00000
#ggbiplot(ex.pca, var.axes = T, alpha = 0.1)
f1 <- ggbiplot(ex.pca, var.axes = T, alpha = 0.1)
Run a PCA withOUT scaling (standardization):
ex.pca2 <- prcomp(df, center = TRUE, scale = FALSE)
summary(ex.pca2)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.408 0.9965 0.10135
## Proportion of Variance 0.664 0.3326 0.00344
## Cumulative Proportion 0.664 0.9966 1.00000
#ggbiplot(ex.pca2, var.axes = T, alpha = 0.1)
f2 <- ggbiplot(ex.pca2, var.axes = T, alpha = 0.1)
Run a PCA withOUT even centering the data:
ex.pca3 <- prcomp(df, center = FALSE, scale = FALSE)
summary(ex.pca3)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 26.6188 1.18305 0.57500
## Proportion of Variance 0.9976 0.00197 0.00047
## Cumulative Proportion 0.9976 0.99953 1.00000
#ggbiplot(ex.pca3, var.axes = T, alpha = 0.1)
f3 <- ggbiplot(ex.pca3, var.axes = T, alpha = 0.1)
library(patchwork)
f1 + f2 + f3
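A compact way to compare the three preprocessing choices is to line up their explained-variance proportions (a sketch using the summary objects):
rbind(scaled        = summary(ex.pca)$importance["Proportion of Variance", ],
      centered_only = summary(ex.pca2)$importance["Proportion of Variance", ],
      raw           = summary(ex.pca3)$importance["Proportion of Variance", ])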
Now repeat the exercise with variables on different scales (variances of 1, 2 and 3), to compare PCA on the correlation matrix (scaled) with PCA on the covariance matrix (non-scaled):
sigma <- rbind(c(1, 0.9, 0.5), c(0.9, 2, 0.1), c(0.5, 0.1, 3))
mu <- c(3, 13, 23)
df <- as.data.frame(MASS::mvrnorm(n = 1000, mu = mu, Sigma = sigma))
head(df)
## V1 V2 V3
## 1 3.944593 13.08659 22.49736
## 2 1.453579 12.92675 21.66832
## 3 4.551205 12.68153 24.12930
## 4 2.693779 11.79429 22.78315
## 5 2.222404 12.72665 20.78659
## 6 1.760597 10.75841 20.06451
cov(df)
## V1 V2 V3
## V1 1.0349156 0.9163631 0.6749478
## V2 0.9163631 2.0549167 0.1845540
## V3 0.6749478 0.1845540 3.2077519
cor(df)
## V1 V2 V3
## V1 1.0000000 0.62837374 0.37043952
## V2 0.6283737 1.00000000 0.07188297
## V3 0.3704395 0.07188297 1.00000000
ggpairs(df)
Run a PCA with scaling (standardization):
ex.pca <- prcomp(df, center = TRUE, scale = TRUE)
summary(ex.pca)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.3275 0.9681 0.5481
## Proportion of Variance 0.5874 0.3124 0.1001
## Cumulative Proportion 0.5874 0.8999 1.0000
#ggbiplot(ex.pca, var.axes = T, alpha = 0.1)
f1 <- ggbiplot(ex.pca, var.axes = T, alpha = 0.1)
Run a PCA withOUT scaling (standardization):
ex.pca2 <- prcomp(df, center = TRUE, scale = FALSE)
summary(ex.pca2)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.8838 1.5302 0.63828
## Proportion of Variance 0.5635 0.3718 0.06469
## Cumulative Proportion 0.5635 0.9353 1.00000
#ggbiplot(ex.pca2, var.axes = T, alpha = 0.1)
f2 <- ggbiplot(ex.pca2, var.axes = T, alpha = 0.1)
Run a PCA withOUT even centering the data:
ex.pca3 <- prcomp(df, center = FALSE, scale = FALSE)
summary(ex.pca3)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 26.6767 1.53166 0.81144
## Proportion of Variance 0.9958 0.00328 0.00092
## Cumulative Proportion 0.9958 0.99908 1.00000
#ggbiplot(ex.pca3, var.axes = T, alpha = 0.1)
f3 <- ggbiplot(ex.pca3, var.axes = T, alpha = 0.1)
library(patchwork)
f1 + f2 + f3
A variance-covariance matrix has the variances on the diagonal and the covariances in the off-diagonal cells (it is symmetric): https://www.researchgate.net/publication/237014579/figure/fig1/AS:213963448033280@1428024269392/Variance-covariance-matrix-depicting-homogeneity-of-variance-and-compound-symmetry.png
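The two PCA variants correspond to decomposing these two matrices: the covariance matrix of the standardized data is the correlation matrix of the original data, so PCA with scale = TRUE is PCA on the correlation matrix. A quick check:
all.equal(cov(scale(df)), cor(df))        # TRUE: the same matrix
all.equal(prcomp(df, scale = TRUE)$sdev^2,
          eigen(cor(df))$values)          # the eigenvalues match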
Planned for the computer lab on Jan 28: master biplots and replicate this analysis on your own data, following https://www.datacamp.com/tutorial/pca-analysis-r