library(rmarkdown); library(knitr); library(readxl)
set.seed(37)
library(readxl)
DistressData <- read_excel("C:/Users/Sarah Chock/OneDrive - University of St. Thomas/Senior Year/STAT 360 Comp Stat and Data Analysis/Exploratory Data Analysis/DistressData.xlsx")
dd <- as.matrix(DistressData)
dd[which(dd==3)] = 6
dd[which(dd==5)] = 3
dd[which(dd==6)] = 5
R <- cor(dd)
library(corrplot)
## corrplot 0.92 loaded
corrplot(R, method = "color")
I see 2 clusters. Cluster 1 includes hopelessness through anxiety, and cluster 2 containg the last 4.
val <- eigen(R)$values
vec <- eigen(R)$vectors
val
## [1] 4.8390360 1.6895478 0.8790759 0.4963882 0.4727502 0.4058604 0.3471259
## [8] 0.3220688 0.2873694 0.2607774
Based on Kaiser's criterion, the intrinsic dimensionality is 2. It's a good thing I took Connor's advice that it wasn't 4!
Lreduced <- diag(val[1:2])
Vreduced <- vec[,(1:2)]
Lreduced
## [,1] [,2]
## [1,] 4.839036 0.000000
## [2,] 0.000000 1.689548
Vreduced
## [,1] [,2]
## [1,] -0.3706128 -0.05171624
## [2,] -0.2688942 -0.36491442
## [3,] -0.2893555 -0.33092172
## [4,] -0.3580225 -0.12874087
## [5,] -0.3760024 -0.13452298
## [6,] -0.3803139 0.04534032
## [7,] -0.3439345 -0.13731172
## [8,] -0.2450188 0.45841190
## [9,] -0.2813070 0.45591701
## [10,] -0.1869765 0.53016132
loading <- Vreduced%*%sqrt(Lreduced)
loading
## [,1] [,2]
## [1,] -0.8152669 -0.06722211
## [2,] -0.5915083 -0.47432527
## [3,] -0.6365186 -0.43014068
## [4,] -0.7875711 -0.16734074
## [5,] -0.8271228 -0.17485648
## [6,] -0.8366072 0.05893453
## [7,] -0.7565806 -0.17848135
## [8,] -0.5389876 0.59585573
## [9,] -0.6188137 0.59261280
## [10,] -0.4113073 0.68911750
Yes, we have quite a bit complexity. There is complexity in the following rows: 2, 3, 8, 9, and 10. These rows are complex because they have correlations greater than .3 or less than -.3 in both columns.
library(palmerpenguins)
## Warning: package 'palmerpenguins' was built under R version 4.0.5
data(penguins)
pablo <- as.matrix(penguins[,which(sapply(penguins, is.numeric))])
Based on Joliffe’s criterion, the intrisic dimensionality is 3!
pengCor <- cor(pablo, use = "pairwise.complete.obs")
pengVal <- eigen(pengCor)$values
pengVec <- eigen(pengCor)$vectors
pengVal
## [1] 2.77008681 0.99348866 0.77191746 0.36520940 0.09929767
Lpeng <- diag(pengVal[1:3])
Vpeng <- pengVec[,(1:3)]
Lpeng
## [,1] [,2] [,3]
## [1,] 2.770087 0.0000000 0.0000000
## [2,] 0.000000 0.9934887 0.0000000
## [3,] 0.000000 0.0000000 0.7719175
Vpeng
## [,1] [,2] [,3]
## [1,] 0.45224564 -0.09346291 -0.592853269
## [2,] -0.39832420 0.01638435 -0.799846355
## [3,] 0.57595238 0.02563960 -0.005357699
## [4,] 0.54399668 -0.11141418 -0.078078773
## [5,] 0.09569837 0.98890127 -0.051437436
pengLoad <- Vpeng%*%sqrt(Lpeng)
pengLoad
## [,1] [,2] [,3]
## [1,] 0.7526986 -0.09315813 -0.520873964
## [2,] -0.6629540 0.01633092 -0.702735675
## [3,] 0.9585908 0.02555599 -0.004707212
## [4,] 0.9054051 -0.11105086 -0.068599099
## [5,] 0.1592763 0.98567648 -0.045192332
Cluster 1: dimensions 1, 2, 3, 4
Cluster 2: dimension 5
Cluster 3: 1, 2
Unfortunately there is complexity with dimensions 1 and 2 :(
poverty <- matrix(c(6.6, .2, .3, .5, .2, 7.1, .5, 9.9, .7, 0.0, 0.0, 0.0, 0.0, 0.0, 39.8, 0.0, 0.0, 0.2, 5.3, 0.0, 0.9, 0.3, 0.2, 45.4, 0.5, 10.6, 1.4, 0.0, 0.8, 1.9, 1.7, 3.4, 30.1, 1.4, 0.2, 0.5), nrow = 9, ncol = 4)
rownames(poverty) <- c("Estonia", "Luxembourg", "Belgium", "Greece", "Spain", "Djibouti", "Cyprus", "Lithuania", "Kosovo")
colnames(poverty) <- c("water", "Electricity", "Sanitation", "Education")
poverty
## water Electricity Sanitation Education
## Estonia 6.6 0.0 5.3 0.0
## Luxembourg 0.2 0.0 0.0 0.8
## Belgium 0.3 0.0 0.9 1.9
## Greece 0.5 0.0 0.3 1.7
## Spain 0.2 0.0 0.2 3.4
## Djibouti 7.1 39.8 45.4 30.1
## Cyprus 0.5 0.0 0.5 1.4
## Lithuania 9.9 0.0 10.6 0.2
## Kosovo 0.7 0.2 1.4 0.5
povCor <- cor(poverty)
povVal <- eigen(povCor)$values
povVec <- eigen(povCor)$vectors
povVal
## [1] 3.2262015143 0.7683207542 0.0045841059 0.0008936255
povVec
## [,1] [,2] [,3] [,4]
## [1,] -0.3360388 0.90949404 0.1628858 0.18266560
## [2,] -0.5423945 -0.25400214 -0.4463446 0.66488167
## [3,] -0.5563632 0.01291813 -0.4087539 -0.72333494
## [4,] -0.5323027 -0.32884135 0.7792079 -0.03677223
Yes, the Kaiser criterion supports this view because there is only one cluster for these 4 dimensions.
Lpov <- povVal[1]
Vpov <- as.matrix(povVec[,1])
povLoad <- Vpov%*%sqrt(Lpov)
povLoad
## [,1]
## [1,] -0.6035805
## [2,] -0.9742289
## [3,] -0.9993189
## [4,] -0.9561024
They all correlate to this factor very strongly. Water is the weakest one with a correlation of -.6 while all the others are at least -.95, but that is still pretty low below the -.3 cutoff.
outliers <- scale(poverty) < qnorm(.005) | scale(poverty) > qnorm(1-.005)
nas <- which(outliers)
poverty[nas] <- NA
poverty
## water Electricity Sanitation Education
## Estonia 6.6 0.0 5.3 0.0
## Luxembourg 0.2 0.0 0.0 0.8
## Belgium 0.3 0.0 0.9 1.9
## Greece 0.5 0.0 0.3 1.7
## Spain 0.2 0.0 0.2 3.4
## Djibouti 7.1 NA NA NA
## Cyprus 0.5 0.0 0.5 1.4
## Lithuania 9.9 0.0 10.6 0.2
## Kosovo 0.7 0.2 1.4 0.5
povCor <- cor(poverty, use = "pairwise.complete.obs")
povVal <- eigen(povCor)$values
povVec <- eigen(povCor)$vectors
povVal
## [1] 2.48342279 1.16723586 0.33874614 0.01059521
povVec
## [,1] [,2] [,3] [,4]
## [1,] 0.61610493 -0.1643111 -0.2439492 0.73068826
## [2,] -0.02951743 0.8934933 -0.4411726 0.07851911
## [3,] 0.61036331 -0.1175120 -0.3988288 -0.67422783
## [4,] -0.49699101 -0.4010767 -0.7660229 0.07311809
Previously we had 2 for our intrinsic dimensionality based on Kaiser, but now we only have 1. Thus, the univariate outliers ARE influential since it changed the intrinsic dimensionality.