library(rmarkdown); library(knitr); library(readxl)
set.seed(37)
The intrinsic dimensionality is 2
DCF <- matrix(c(1.00, -.72, -.09, -.38, -.72, 1.00, 0.23, 0.49, -.09, 0.23, 1.0, -.46, -.38, 0.49, -.46, 1.00), nrow = 4, ncol = 4)
rownames(DCF)<- c("DBH", "TYPE", "SLOPE", "ASPECT")
colnames(DCF)<- c("DBH", "TYPE", "SLOPE", "ASPECT")
DCF
## DBH TYPE SLOPE ASPECT
## DBH 1.00 -0.72 -0.09 -0.38
## TYPE -0.72 1.00 0.23 0.49
## SLOPE -0.09 0.23 1.00 -0.46
## ASPECT -0.38 0.49 -0.46 1.00
eigen(DCF)$values
## [1] 2.0746888 1.3654830 0.3926504 0.1671779
library(psych)
## Warning: package 'psych' was built under R version 4.0.5
Arot <- pca(DCF, 2, rotate = "varimax")$loadings[]
Arot
## RC1 RC2
## DBH -0.8854395 0.02236984
## TYPE 0.9433740 0.02898144
## SLOPE 0.2220135 0.92760753
## ASPECT 0.5426109 -0.74879998
communality <- Arot[1:4,1]^2 + Arot[1:4,2]^2
communality
## DBH TYPE SLOPE ASPECT
## 0.7845036 0.8907945 0.9097457 0.8551280
We lost 21.5% of our data in DBH from dimensionality reduction.
1 - communality[1]
## DBH
## 0.2154964
variance <- (sum(Arot[,1]^2) + sum(Arot[,2]^2))/4
variance
## [1] 0.8600429
We lost about 14% of the total variance due to dimensionality reduction. This is in our 10-20% sweet spot, so we’re not too concerned about this amount of loss.
1-variance
## [1] 0.1399571
stonks <- matrix(c(0.00023, 0.00038, 0.00022, 0.00007, 0.00006,
0.00038, 0.00134, 0.00041, 0.00013, 0.00008,
0.00022, 0.00041, 0.00093, 0.00019, 0.00002,
0.00007, 0.00013, 0.00019, 0.00068, 0.00037,
0.00006, 0.00008, 0.00002, 0.00037, 0.00053), nrow = 5,ncol = 5)
rownames(stonks)<- c("SNDL", "WRN", "NGD", "UPH", "WISH")
colnames(stonks)<- c("SNDL", "WRN", "NGD", "UPH", "WISH")
stonks*(365-1)
## SNDL WRN NGD UPH WISH
## SNDL 0.08372 0.13832 0.08008 0.02548 0.02184
## WRN 0.13832 0.48776 0.14924 0.04732 0.02912
## NGD 0.08008 0.14924 0.33852 0.06916 0.00728
## UPH 0.02548 0.04732 0.06916 0.24752 0.13468
## WISH 0.02184 0.02912 0.00728 0.13468 0.19292
corStonks <- solve(sqrt(diag(diag(stonks)))) %*% stonks %*%
t(solve(sqrt(diag(diag(stonks)))))
corStonks
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1.0000000 0.68449027 0.47568263 0.1770026 0.17184995
## [2,] 0.6844903 1.00000000 0.36727383 0.1361873 0.09492916
## [3,] 0.4756826 0.36727383 1.00000000 0.2389228 0.02848725
## [4,] 0.1770026 0.13618726 0.23892284 1.0000000 0.61632436
## [5,] 0.1718499 0.09492916 0.02848725 0.6163244 1.00000000
The intrinsic dimensionality is 2
eigen(corStonks)$values
## [1] 2.2315142 1.4234986 0.7052701 0.3628567 0.2768604
Astonks <- pca(corStonks, 2, rotate = "varimax")$loadings[]
Astonks
## RC1 RC2
## [1,] 0.88210343 0.12135148
## [2,] 0.84911037 0.04179123
## [3,] 0.71256882 0.09587557
## [4,] 0.13848872 0.88749773
## [5,] 0.01981366 0.90292570
varStonks <- (sum(Astonks[,1]^2) + sum(Astonks[,2]^2))/5
varStonks
## [1] 0.7310026
The intrinsic dimensionality with Joliffe is 3.
eigen(corStonks)$values
## [1] 2.2315142 1.4234986 0.7052701 0.3628567 0.2768604
AJstonks <- pca(corStonks, 3, rotate = "varimax")$loadings[]
AJstonks
## RC1 RC2 RC3
## [1,] 0.86568160 0.11504070 0.2703146
## [2,] 0.92065922 0.04156458 0.1084670
## [3,] 0.27722271 0.06267000 0.9407033
## [4,] 0.02417604 0.87871217 0.2594152
## [5,] 0.11690348 0.90890751 -0.1340521
varJStonks <- (sum(AJstonks[,1]^2) + sum(AJstonks[,2]^2) + sum(AJstonks[,3]^2))/5
varJStonks
## [1] 0.8720566
Yes, we think this extra factor is warranted because it retains an additional 14% of our original variance in our data. We were in the danger zone previously with 27% of variance loss from our original data, but now we only have 13% variance loss. We do have some mild concerns, because we're getting closer to having some complexity, but we haven't crossed that .3 threshold yet.
library(palmerpenguins)
## Warning: package 'palmerpenguins' was built under R version 4.0.5
data(penguins)
pablo <- as.matrix(penguins[,which(sapply(penguins, is.numeric))])
pablo <- pablo[,1:4]
head(pablo)
## bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## [1,] 39.1 18.7 181 3750
## [2,] 39.5 17.4 186 3800
## [3,] 40.3 18.0 195 3250
## [4,] NA NA NA NA
## [5,] 36.7 19.3 193 3450
## [6,] 39.3 20.6 190 3650
The intrinsic dimensionality is 2! According to Jollife.
corPab <- cor(pablo, use = "pairwise.complete.obs")
eigen(corPab)$values
## [1] 2.7537551 0.7725168 0.3652359 0.1084922
AJpab <- pca(corPab, 2, rotate = "varimax")$loadings[]
AJpab
## RC1 RC2
## bill_length_mm 0.9198256 -0.002280866
## bill_depth_mm -0.1479766 -0.954513144
## flipper_length_mm 0.7875472 0.541706148
## body_mass_g 0.7907894 0.456257741
betaPab <- solve(corPab) %*% AJpab
betaPab
## RC1 RC2
## bill_length_mm 0.6118335 -0.4029028
## bill_depth_mm 0.3174765 -0.8838835
## flipper_length_mm 0.2870522 0.1951785
## body_mass_g 0.3264230 0.1088709
standardPablo <- scale(pablo)
factorScores <- standardPablo %*% betaPab
head(factorScores)
## RC1 RC2
## [1,] -0.8818008 -0.67513877
## [2,] -0.9235477 -0.04661149
## [3,] -0.7775815 -0.32394419
## [4,] NA NA
## [5,] -0.9314458 -0.64074029
## [6,] -0.4109149 -1.42896013
povertyData <- matrix(c(6.6, .2, .3, .5, .2, 7.1, .5, 9.9, .7, 0.0, 0.0, 0.0, 0.0, 0.0, 39.8, 0.0, 0.0, 0.2,5.3, 0.0, 0.9, 0.3, 0.2, 45.4, 0.5, 10.6, 1.4, 0.0, 0.8, 1.9, 1.7, 3.4, 30.1, 1.4, 0.2, 0.5), nrow = 9, ncol = 4)
rownames(povertyData) <- c("Estonia", "Luxembourg", "Belgium", "Greece", "Spain", "Djibouti", "Cyprus", "Lithuania", "Kosovo")
colnames(povertyData) <- c("Water", "Electricity", "Sanitation", "Education")
povertyData
## Water Electricity Sanitation Education
## Estonia 6.6 0.0 5.3 0.0
## Luxembourg 0.2 0.0 0.0 0.8
## Belgium 0.3 0.0 0.9 1.9
## Greece 0.5 0.0 0.3 1.7
## Spain 0.2 0.0 0.2 3.4
## Djibouti 7.1 39.8 45.4 30.1
## Cyprus 0.5 0.0 0.5 1.4
## Lithuania 9.9 0.0 10.6 0.2
## Kosovo 0.7 0.2 1.4 0.5
corPov <- cor(povertyData)
Apov <- pca(corPov, 1, rotate = "varimax")$loadings[]
Apov
## PC1
## Water 0.6035805
## Electricity 0.9742289
## Sanitation 0.9993189
## Education 0.9561024
betaPov <- solve(corPov) %*% Apov
standardPov <- scale(povertyData)
factorScoresPov <- standardPov %*% betaPov
factorScoresPov
## PC1
## Estonia -0.09600203
## Luxembourg -0.49451173
## Belgium -0.43705748
## Greece -0.44603880
## Spain -0.41068959
## Djibouti 2.59858223
## Cyprus -0.45102673
## Lithuania 0.18213901
## Kosovo -0.44539489
Djibouti has a factor score of 2.6. This means that Djibouti is 2.6 standard deviations above the the mean level of poverty found in the cluster. So pretty much, Djibouti is very impoverished.
The factor is meant to be a measure of poverty, as determing by the World Bank Group. Each number in the original data is what percentage of the country DOESN'T have access to the corresponding dimension. Therefore, a factor above average means you are more impoverished, which is bad :( It would be better to have a below average factor score.
library(cats)
here_kitty()
## meow