1- Empirical example
2- PCA on correlated vs uncorrelated variables
3- PCA on the correlation matrix (scaled) vs the covariance matrix (non-scaled)
library(magrittr)
library(tidyverse)
library(corrplot)
library(psych)
library(plotly)
library(htmlwidgets)
library(GGally)     # for ggpairs() in the simulation examples below
require(FactoMineR)
require(factoextra)
Times Higher Education 2021 ranking data
data <- read.csv("THE2021.csv")
str(data)
## 'data.frame': 1448 obs. of 11 variables:
## $ name : chr "University of Oxford" "Stanford University" "Harvard University" "California Institute of Technology" ...
## $ location : chr "United Kingdom" "United States" "United States" "United States" ...
## $ scores_teaching : num 91.3 92.2 94.4 92.5 90.7 90.3 85.8 91.9 88.8 88.9 ...
## $ scores_research : num 99.6 96.7 98.8 96.9 94.4 99.2 97.2 93.8 92.5 90.5 ...
## $ scores_citations : num 98 99.9 99.4 97 99.7 95.6 99.1 97.9 98.9 98.6 ...
## $ scores_industry_income : num 68.7 90.1 46.8 92.7 90.4 52.1 84.3 56.1 58 54.9 ...
## $ scores_international_outlook: num 96.4 79.5 77.7 83.6 90 95.7 72.3 68.4 80.2 74 ...
## $ stats_number_students : int 20774 16223 21261 2238 11276 19370 39918 12910 8091 14292 ...
## $ stats_student_staff_ratio : num 11.1 7.4 9.3 6.3 8.4 11 19.8 6 8 5.9 ...
## $ stats_pc_intl_students : int 41 23 25 33 34 38 17 20 23 31 ...
## $ stats_female_share : int 46 44 49 36 39 47 51 50 46 46 ...
data[,3:11] %>%
#na.omit() %>%
cor() %>%
corrplot() # six variables correlate with one another, the two internationalisation variables form a separate pair, and the female share is unrelated to the rest -- consider dropping it.
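If you do want to see the correlation structure without the gender-balance variable, a quick sketch (not applied below):
data[ , 3:11] %>%
  select(-stats_female_share) %>%
  cor() %>%
  corrplot()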
# library(summarytools)
# dfSummary(data)
Check the skewness and kurtosis of the variables:
describe(data)[11:12]
## skew kurtosis
## name* 0.00 -1.20
## location* -0.06 -1.46
## scores_teaching 2.08 5.35
## scores_research 1.95 4.15
## scores_citations 0.26 -1.15
## scores_industry_income 1.84 2.58
## scores_international_outlook 0.58 -0.74
## stats_number_students 3.81 24.69
## stats_student_staff_ratio 2.33 10.18
## stats_pc_intl_students 1.81 4.47
## stats_female_share -0.73 0.75
Look at how different the variances are across the variables:
lapply(data[ , 3:11], var)
## $scores_teaching
## [1] 179.3113
##
## $scores_research
## [1] 293.1243
##
## $scores_citations
## [1] 766.5359
##
## $scores_industry_income
## [1] 275.3449
##
## $scores_international_outlook
## [1] 540.7424
##
## $stats_number_students
## [1] 424682073
##
## $stats_student_staff_ratio
## [1] 105.9024
##
## $stats_pc_intl_students
## [1] 144.5496
##
## $stats_female_share
## [1] 147.2624
Standardize (scale) only the numeric variables:
data[ , 3:11] <- data[ , 3:11] %>%
scale() %>%
as.data.frame()
str(data)
## 'data.frame': 1448 obs. of 11 variables:
## $ name : chr "University of Oxford" "Stanford University" "Harvard University" "California Institute of Technology" ...
## $ location : chr "United Kingdom" "United States" "United States" "United States" ...
## $ scores_teaching : num 4.76 4.83 4.99 4.85 4.72 ...
## $ scores_research : num 4.47 4.3 4.42 4.31 4.16 ...
## $ scores_citations : num 1.81 1.88 1.87 1.78 1.88 ...
## $ scores_industry_income : num 1.3969 2.6866 0.0771 2.8433 2.7047 ...
## $ scores_international_outlook: num 2.13 1.4 1.32 1.58 1.85 ...
## $ stats_number_students : num -0.0712 -0.292 -0.0475 -0.9706 -0.5321 ...
## $ stats_student_staff_ratio : num -0.72 -1.08 -0.895 -1.187 -0.983 ...
## $ stats_pc_intl_students : num 2.473 0.976 1.143 1.808 1.891 ...
## $ stats_female_share : num -0.3246 -0.4894 -0.0774 -1.1487 -0.9014 ...
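A quick check that the scaling worked: the column means should be (numerically) zero and the standard deviations one.
round(colMeans(data[ , 3:11]), 3)
sapply(data[ , 3:11], sd)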
Run PCA
pca_scores <- prcomp(data[,3:11])
str(pca_scores, give.attr = F)
## List of 5
## $ sdev : num [1:9] 1.867 1.21 1.127 0.948 0.816 ...
## $ rotation: num [1:9, 1:9] -0.438 -0.483 -0.395 -0.278 -0.422 ...
## $ center : Named num [1:9] -7.78e-17 -8.45e-17 -6.91e-17 5.70e-17 7.90e-17 ...
## $ scale : logi FALSE
## $ x : num [1:1448, 1:9] -7.23 -6.65 -6.1 -7.06 -7.09 ...
How much variance do the PCs retain?
pca_scores$sdev ^ 2
## [1] 3.48591462 1.46337701 1.26901618 0.89840475 0.66615817 0.55930826 0.43691022
## [8] 0.14926140 0.07164939
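These are the eigenvalues, i.e. the variances of the individual PCs. To express them as proportions of the total variance, divide by their sum, or simply call summary():
pca_scores$sdev^2 / sum(pca_scores$sdev^2)  # proportion of variance per PC
summary(pca_scores)                         # the same information, formatted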
library(ggbiplot)
ggbiplot(pca_scores, var.axes = T, alpha = 0.1)
bi <- ggbiplot(pca_scores, var.axes = T, alpha = 0.1)
ggplotly(bi) # see tooltip argument for labels
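One way to get university names on hover is the same pattern used further below: plot the scores yourself with a text aesthetic and pass it to the tooltip argument (a sketch; the object names here are arbitrary):
scores_df <- as.data.frame(pca_scores$x)
scores_df$name <- data$name
bi_hover <- ggplot(scores_df, aes(PC1, PC2, text = name)) +
  geom_point(alpha = 0.1) +
  theme_bw()
ggplotly(bi_hover, tooltip = "text")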
Let’s select a group and map it onto the PCs:
score_components <- cbind(data, pca_scores$x[, 1:3]) %>%
as.data.frame
#table(score_components$location)
score_components$NAm <- if_else(score_components$location %in% c("United States", "Canada"), "NAm", "not")
gr <- factor(score_components$NAm)
summary(gr)
## NAm not
## 201 1247
gg1 <- ggplot(score_components, aes(PC1, PC2)) +
geom_point(aes(color = NAm)) +
#geom_text(label = if_else(score_components$NAm == "NAm", str_trunc(score_components$name, 20), "")) +
theme_bw()
ggplotly(gg1)
Create labels for a region of data points:
gg2 <- ggplot(score_components, aes(PC1, PC2)) +
geom_point(aes(color = NAm)) +
geom_text(label = if_else(score_components$PC1 < -5 & score_components$PC2 > 0, str_trunc(score_components$name, 15), "")) +
theme_bw()
ggplotly(gg2)
You can add formal tests to quantify the differences: the group difference along PC1 is statistically significant, while the differences along PC2 and PC3 are not.
t.test(score_components$PC1 ~ score_components$NAm)
##
## Welch Two Sample t-test
##
## data: score_components$PC1 by score_components$NAm
## t = -9.5491, df = 253.12, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group NAm and group not is not equal to 0
## 95 percent confidence interval:
## -1.721327 -1.132717
## sample estimates:
## mean in group NAm mean in group not
## -1.228934 0.198088
t.test(score_components$PC2 ~ score_components$NAm)
##
## Welch Two Sample t-test
##
## data: score_components$PC2 by score_components$NAm
## t = -0.766, df = 373.03, p-value = 0.4442
## alternative hypothesis: true difference in means between group NAm and group not is not equal to 0
## 95 percent confidence interval:
## -0.18572089 0.08158888
## sample estimates:
## mean in group NAm mean in group not
## -0.044838609 0.007227394
t.test(score_components$PC3 ~ score_components$NAm)
##
## Welch Two Sample t-test
##
## data: score_components$PC3 by score_components$NAm
## t = 0.67417, df = 349.81, p-value = 0.5007
## alternative hypothesis: true difference in means between group NAm and group not is not equal to 0
## 95 percent confidence interval:
## -0.08551853 0.17472418
## sample estimates:
## mean in group NAm mean in group not
## 0.038411407 -0.006191414
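A more compact way to see the same pattern is to compare the group means directly (a sketch using dplyr):
score_components %>%
  group_by(NAm) %>%
  summarise(across(c(PC1, PC2, PC3), mean))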
Log-transform the skewed variables. A small constant is added to the share of international students because the log of 0 is undefined. Note also that these columns were already standardized above, so the log of their non-positive values will be NaN:
data$log_teaching <- log(data$scores_teaching)
data$log_research <- log(data$scores_research)
data$log_industry <- log(data$scores_industry_income)
data$log_stud <- log(data$stats_number_students)
data$log_staff <- log(data$stats_student_staff_ratio)
data$log_intl <- log(data$stats_pc_intl_students + 0.1)
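Because of the standardization, many values are non-positive and their logarithm is NaN; a quick count of how many values are lost per column:
colSums(is.na(data[ , c("log_teaching", "log_research", "log_industry",
                        "log_stud", "log_staff", "log_intl")]))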
data_short <- select(data,
name,
location,
log_teaching,
log_research,
scores_citations,
log_industry,
scores_international_outlook,
log_intl,
log_stud,
log_staff)
Now the data are closer to a normal distribution:
describe(data_short)[11:12]
## skew kurtosis
## name* 0.00 -1.20
## location* -0.06 -1.46
## log_teaching -0.74 0.76
## log_research -0.77 0.61
## scores_citations 0.26 -1.15
## log_industry -1.33 1.85
## scores_international_outlook 0.58 -0.74
## log_intl -0.45 -0.55
## log_stud -0.85 1.88
## log_staff -0.76 0.53
Note that here we use more variables but drop cases with NAs (the NaNs produced by logging non-positive standardized values), so the sample is much smaller and no longer representative. How you proceed with your own data depends on your goals.
data_short[ , 3:10] <- data_short[ , 3:10] %>%
scale()
data_short <- as.data.frame(data_short)
summary(data_short)
## name location log_teaching log_research
## Length:1448 Length:1448 Min. :-3.8700 Min. :-4.0292
## Class :character Class :character 1st Qu.:-0.5779 1st Qu.:-0.5943
## Mode :character Mode :character Median : 0.1417 Median : 0.1407
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7067 3rd Qu.: 0.7194
## Max. : 1.7790 Max. : 1.6353
## NA's :935 NA's :945
## scores_citations log_industry scores_international_outlook
## Min. :-1.6563 Min. :-3.8610 Min. :-1.4391
## 1st Qu.:-0.8888 1st Qu.:-0.4940 1st Qu.:-0.8371
## Median :-0.1068 Median : 0.2425 Median :-0.2028
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.8359 3rd Qu.: 0.7562 3rd Qu.: 0.6906
## Max. : 1.8869 Max. : 1.1185 Max. : 2.2807
## NA's :1035
## log_intl log_stud log_staff
## Min. :-2.0290 Min. :-4.8670 Min. :-3.0946
## 1st Qu.:-0.6689 1st Qu.:-0.5325 1st Qu.:-0.5030
## Median : 0.1082 Median : 0.1131 Median : 0.1321
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.7688 3rd Qu.: 0.6399 3rd Qu.: 0.6991
## Max. : 2.1118 Max. : 2.4083 Max. : 2.2159
## NA's :873 NA's :904 NA's :910
data_short <- na.omit(data_short)
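Check how many universities remain after dropping the incomplete rows:
nrow(data_short)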
Run PCA again, with a different function:
soc.pca1 <- PCA(data_short[, 3:10], graph = T)
summary.PCA(soc.pca1)
##
## Call:
## PCA(X = data_short[, 3:10], graph = T)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 2.694 1.901 1.063 0.909 0.657 0.492 0.156
## % of var. 33.680 23.767 13.283 11.361 8.213 6.156 1.952
## Cumulative % of var. 33.680 57.446 70.730 82.090 90.304 96.460 98.412
## Dim.8
## Variance 0.127
## % of var. 1.588
## Cumulative % of var. 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr
## 7 | 3.407 | 2.568 5.827 0.568 | 1.428 2.554
## 17 | 3.799 | 2.907 7.468 0.586 | -0.160 0.032
## 30 | 2.987 | 2.783 6.843 0.868 | -0.088 0.010
## 31 | 2.131 | 1.041 0.958 0.239 | 1.694 3.595
## 36 | 2.037 | 1.652 2.412 0.658 | -0.077 0.007
## 39 | 2.101 | 1.334 1.573 0.403 | 0.956 1.145
## 43 | 2.085 | 0.903 0.721 0.188 | 1.647 3.395
## 48 | 3.265 | 2.806 6.957 0.738 | -0.677 0.574
## 54 | 2.304 | 1.834 2.972 0.633 | -0.187 0.044
## 57 | 2.538 | 1.850 3.025 0.531 | -0.114 0.016
## cos2 Dim.3 ctr cos2
## 7 0.176 | -0.427 0.408 0.016 |
## 17 0.002 | -1.917 8.236 0.255 |
## 30 0.001 | 0.081 0.015 0.001 |
## 31 0.632 | 0.220 0.108 0.011 |
## 36 0.001 | 0.572 0.733 0.079 |
## 39 0.207 | 0.843 1.591 0.161 |
## 43 0.624 | -0.282 0.178 0.018 |
## 48 0.043 | -0.217 0.106 0.004 |
## 54 0.007 | 0.317 0.226 0.019 |
## 57 0.002 | -0.088 0.017 0.001 |
##
## Variables
## Dim.1 ctr cos2 Dim.2 ctr cos2
## log_teaching | 0.577 12.374 0.333 | 0.708 26.372 0.501 |
## log_research | 0.799 23.706 0.639 | 0.371 7.230 0.137 |
## scores_citations | 0.695 17.910 0.483 | -0.224 2.651 0.050 |
## log_industry | 0.156 0.903 0.024 | 0.628 20.764 0.395 |
## scores_international_outlook | 0.693 17.837 0.481 | -0.635 21.201 0.403 |
## log_intl | 0.582 12.565 0.339 | -0.490 12.611 0.240 |
## log_stud | 0.397 5.853 0.158 | 0.387 7.895 0.150 |
## log_staff | -0.488 8.853 0.239 | 0.156 1.277 0.024 |
## Dim.3 ctr cos2
## log_teaching -0.053 0.267 0.003 |
## log_research 0.234 5.171 0.055 |
## scores_citations -0.279 7.342 0.078 |
## log_industry 0.556 29.140 0.310 |
## scores_international_outlook 0.022 0.046 0.000 |
## log_intl 0.393 14.518 0.154 |
## log_stud -0.678 43.292 0.460 |
## log_staff 0.049 0.225 0.002 |
fviz_screeplot(soc.pca1, addlabels = TRUE)
soc.pca1$var$contrib[ ,1:3]
## Dim.1 Dim.2 Dim.3
## log_teaching 12.3739350 26.371522 0.26697277
## log_research 23.7055882 7.230497 5.17054762
## scores_citations 17.9097879 2.650631 7.34207163
## log_industry 0.9034306 20.763748 29.13971720
## scores_international_outlook 17.8368497 21.200549 0.04590029
## log_intl 12.5645372 12.610796 14.51836055
## log_stud 5.8533203 7.895497 43.29192810
## log_staff 8.8525511 1.276761 0.22450185
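The contributions can also be visualized directly with factoextra, for example for the first two dimensions (a sketch):
fviz_contrib(soc.pca1, choice = "var", axes = 1)
fviz_contrib(soc.pca1, choice = "var", axes = 2)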
Two PCs retain about 57% of the variance -- not a great solution.
soc.pca1$eig
## eigenvalue percentage of variance cumulative percentage of variance
## comp 1 2.6943816 33.679770 33.67977
## comp 2 1.9013302 23.766628 57.44640
## comp 3 1.0626532 13.283165 70.72956
## comp 4 0.9088552 11.360689 82.09025
## comp 5 0.6570669 8.213336 90.30359
## comp 6 0.4924876 6.156095 96.45968
## comp 7 0.1561454 1.951818 98.41150
## comp 8 0.1270799 1.588499 100.00000
Let’s look at a 2-PC solution:
pca_scores1 <- prcomp(data_short[ , 3:10])
score_components1 <- cbind(data_short, pca_scores1$x[, 1:3]) %>%
as.data.frame
gg1 <- ggplot(score_components1, aes(PC1, PC2, text = name)) +
geom_point() +
theme_bw()
ggplotly(gg1, tooltip = "text")
To wrap up the empirical example:
- evaluate the quality of the PCA and interpret the 1st and 2nd principal components based on the variables you used (see the sketch after this list);
- the more variance the first two components retain, the better;
- make sense of the PCA solution by looking at specific data points in the new coordinates and telling a story.
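A natural starting point for the interpretation step is the matrix of loadings (variable coordinates):
round(pca_scores1$rotation[ , 1:2], 2)  # prcomp loadings
round(soc.pca1$var$coord[ , 1:2], 2)    # FactoMineR variable coordinates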
Now let’s simulate data with a known correlation structure to see how PCA handles correlated vs uncorrelated variables. First, three variables with unit variances:
sigma <- rbind(c(1, 0.9, 0.5), c(0.9, 1, 0.1), c(0.5, 0.1, 1))
mu <- c(3, 13, 23)
df <- as.data.frame(MASS::mvrnorm(n = 1000, mu = mu, Sigma = sigma)) # MASS:: avoids masking dplyr::select
head(df)
## V1 V2 V3
## 1 3.769898 13.46396 24.18749
## 2 2.198070 11.83659 23.39284
## 3 2.276986 12.14388 22.78743
## 4 4.088129 13.82256 24.06543
## 5 4.390463 14.78453 23.06515
## 6 4.248605 13.78361 24.12097
cov(df)
## V1 V2 V3
## V1 0.9488339 0.86953211 0.46052264
## V2 0.8695321 0.99686253 0.04521317
## V3 0.4605226 0.04521317 1.04003360
cor(df)
## V1 V2 V3
## V1 1.0000000 0.89407280 0.46358801
## V2 0.8940728 1.00000000 0.04440416
## V3 0.4635880 0.04440416 1.00000000
ggpairs(df)
corrplot(sigma, is.corr = T)
Run a PCA with scaling (standardization):
ex.pca <- prcomp(df, center = TRUE, scale = TRUE)
summary(ex.pca)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.4233 0.9817 0.10262
## Proportion of Variance 0.6753 0.3212 0.00351
## Cumulative Proportion 0.6753 0.9965 1.00000
#ggbiplot(ex.pca, var.axes = T, alpha = 0.1)
f1 <- ggbiplot(ex.pca, var.axes = T, alpha = 0.1)
Run a PCA withOUT scaling (standardization):
ex.pca2 <- prcomp(df, center = TRUE, scale = FALSE)
summary(ex.pca2)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.408 0.9965 0.10135
## Proportion of Variance 0.664 0.3326 0.00344
## Cumulative Proportion 0.664 0.9966 1.00000
#ggbiplot(ex.pca2, var.axes = T, alpha = 0.1)
f2 <- ggbiplot(ex.pca2, var.axes = T, alpha = 0.1)
Run a PCA withOUT even centering the data:
ex.pca3 <- prcomp(df, center = FALSE, scale = FALSE)
summary(ex.pca3)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 26.6188 1.18305 0.57500
## Proportion of Variance 0.9976 0.00197 0.00047
## Cumulative Proportion 0.9976 0.99953 1.00000
#ggbiplot(ex.pca3, var.axes = T, alpha = 0.1)
f3 <- ggbiplot(ex.pca3, var.axes = T, alpha = 0.1)
library(patchwork)
f1 + f2 + f3
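A compact way to compare the three preprocessing choices is to line up their explained-variance proportions (a sketch using the summary objects):
rbind(scaled        = summary(ex.pca)$importance["Proportion of Variance", ],
      centered_only = summary(ex.pca2)$importance["Proportion of Variance", ],
      raw           = summary(ex.pca3)$importance["Proportion of Variance", ])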
Now repeat the exercise with variables on different scales (variances of 1, 2 and 3), to compare PCA on the correlation matrix (scaled) with PCA on the covariance matrix (non-scaled):
sigma <- rbind(c(1, 0.9, 0.5), c(0.9, 2, 0.1), c(0.5, 0.1, 3))
mu <- c(3, 13, 23)
df <- as.data.frame(MASS::mvrnorm(n = 1000, mu = mu, Sigma = sigma))
head(df)
## V1 V2 V3
## 1 3.944593 13.08659 22.49736
## 2 1.453579 12.92675 21.66832
## 3 4.551205 12.68153 24.12930
## 4 2.693779 11.79429 22.78315
## 5 2.222404 12.72665 20.78659
## 6 1.760597 10.75841 20.06451
cov(df)
## V1 V2 V3
## V1 1.0349156 0.9163631 0.6749478
## V2 0.9163631 2.0549167 0.1845540
## V3 0.6749478 0.1845540 3.2077519
cor(df)
## V1 V2 V3
## V1 1.0000000 0.62837374 0.37043952
## V2 0.6283737 1.00000000 0.07188297
## V3 0.3704395 0.07188297 1.00000000
ggpairs(df)
Run a PCA with scaling (standardization):
ex.pca <- prcomp(df, center = TRUE, scale = TRUE)
summary(ex.pca)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.3275 0.9681 0.5481
## Proportion of Variance 0.5874 0.3124 0.1001
## Cumulative Proportion 0.5874 0.8999 1.0000
#ggbiplot(ex.pca, var.axes = T, alpha = 0.1)
f1 <- ggbiplot(ex.pca, var.axes = T, alpha = 0.1)
Run a PCA withOUT scaling (standardization):
ex.pca2 <- prcomp(df, center = TRUE, scale = FALSE)
summary(ex.pca2)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 1.8838 1.5302 0.63828
## Proportion of Variance 0.5635 0.3718 0.06469
## Cumulative Proportion 0.5635 0.9353 1.00000
#ggbiplot(ex.pca2, var.axes = T, alpha = 0.1)
f2 <- ggbiplot(ex.pca2, var.axes = T, alpha = 0.1)
Run a PCA withOUT even centering the data:
ex.pca3 <- prcomp(df, center = FALSE, scale = FALSE)
summary(ex.pca3)
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 26.6767 1.53166 0.81144
## Proportion of Variance 0.9958 0.00328 0.00092
## Cumulative Proportion 0.9958 0.99908 1.00000
#ggbiplot(ex.pca3, var.axes = T, alpha = 0.1)
f3 <- ggbiplot(ex.pca3, var.axes = T, alpha = 0.1)
library(patchwork)
f1 + f2 + f3
A variance-covariance matrix has the variances on the diagonal and the covariances in the off-diagonal cells (it is symmetric): https://www.researchgate.net/publication/237014579/figure/fig1/AS:213963448033280@1428024269392/Variance-covariance-matrix-depicting-homogeneity-of-variance-and-compound-symmetry.png
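The two PCA variants correspond to decomposing these two matrices: the covariance matrix of the standardized data is the correlation matrix of the original data, so PCA with scale = TRUE is PCA on the correlation matrix. A quick check:
all.equal(cov(scale(df)), cor(df))        # TRUE: the same matrix
all.equal(prcomp(df, scale = TRUE)$sdev^2,
          eigen(cor(df))$values)          # the eigenvalues match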
Planned for the computer lab on Jan 28: master biplots and replicate this analysis on your own data, following https://www.datacamp.com/tutorial/pca-analysis-r