Data Visualization Homework 7

Antanas Kaminskas

2022-05-09

Task

Multidimensional data: projections:

  1. Find principal components of your data (use: Matlab: pca, R: princomp, or other).

  2. Make visualizations in a two-dimensional projection.

  3. Make the attribute axis representation.

  4. Select the most informative and the least informative attributes according to the longest and shortest axes, respectively.

  5. Create different subsets of attributes (informative only, non-informative only, all except the non-informative attributes, etc.) and visualize the data using the nonlinear projection method MDS.

  6. Present different visualizations, comment on the result.

library(stats)
library(pracma)
## Warning: paketas 'pracma' buvo sukurtas pagal R versiją 4.1.3
library(ggplot2)
library(plot3D)
library(rgl)
library(viscomplexr)
library(philentropy)
## Warning: paketas 'philentropy' buvo sukurtas pagal R versiją 4.1.3
library(factoextra)
## Warning: paketas 'factoextra' buvo sukurtas pagal R versiją 4.1.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(corrplot)
## corrplot 0.92 loaded
library(RColorBrewer)
library(PerformanceAnalytics)
## Warning: paketas 'PerformanceAnalytics' buvo sukurtas pagal R versiją 4.1.3
## Įkeliamas reikalingas paketas: xts
## Įkeliamas reikalingas paketas: zoo
## 
## Pridedamas paketas: 'zoo'
## Šie objektai yra užmaskuoti nuo 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Pridedamas paketas: 'PerformanceAnalytics'
## Šis objektas yra užmaskuotas nuo 'package:graphics':
## 
##     legend

Data

# Sample 100 points along a straight segment of the complex plane, from
# -40 - 10i to 40 + 10i, and evaluate the zeta function at each point.
x <- seq(-40, 40, length.out = 100)
y <- seq(-10, 10, length.out = 100)
s <- complex(real = x, imaginary = y)
z <- zeta(s)
m <- abs(z)

# Attributes used for PCA: Re(s), Im(s), the real part of zeta(s) and its
# modulus. Re(z) is taken explicitly: as.numeric(z) on a complex vector keeps
# only the real part and raises an "imaginary parts discarded" warning.
# Column names are spelled out so they stay identical to the names
# data.frame() previously derived from the deparsed arguments.
# NOTE(review): x and y are both evenly spaced ramps of the same length, so
# Re(s) and Im(s) are exactly linearly related (correlation 1). One of the
# principal components therefore has zero variance.
dff <- data.frame(
  Re.s. = Re(s),
  as.numeric.Im.s.. = Im(s),
  as.numeric.z. = Re(z),
  m = m
)
head(dff)
##       Re.s. as.numeric.Im.s.. as.numeric.z.            m
## 1 -40.00000        -10.000000 -3.451914e+20 3.001451e+21
## 2 -39.19192         -9.797980  2.738126e+20 5.011015e+20
## 3 -38.38384         -9.595960  8.136942e+19 8.504820e+19
## 4 -37.57576         -9.393939  1.348245e+19 1.467905e+19
## 5 -36.76768         -9.191919  1.139870e+18 2.577382e+18
## 6 -35.95960         -8.989899 -1.150264e+17 4.605423e+17

Computed principal components and their parameters

# PCA on all four attributes. scale. = TRUE standardizes every column to unit
# variance before the decomposition; the argument is spelled out in full
# rather than relying on partial matching of `scale`.
PC <- prcomp(dff, scale. = TRUE)
PC
## Standard deviations (1, .., p=4):
## [1] 1.453075e+00 1.254517e+00 5.610359e-01 7.788648e-17
## 
## Rotation (n x k) = (4 x 4):
##                          PC1        PC2       PC3           PC4
## Re.s.              0.6440319 -0.2781415 0.0886577  7.071068e-01
## as.numeric.Im.s..  0.6440319 -0.2781415 0.0886577 -7.071068e-01
## as.numeric.z.      0.2056652  0.6956227 0.6883393 -1.543904e-16
## m                 -0.3579770 -0.6011520 0.7144709  5.551115e-16
PC$sdev
## [1] 1.453075e+00 1.254517e+00 5.610359e-01 7.788648e-17
PC$rotation
##                          PC1        PC2       PC3           PC4
## Re.s.              0.6440319 -0.2781415 0.0886577  7.071068e-01
## as.numeric.Im.s..  0.6440319 -0.2781415 0.0886577 -7.071068e-01
## as.numeric.z.      0.2056652  0.6956227 0.6883393 -1.543904e-16
## m                 -0.3579770 -0.6011520 0.7144709  5.551115e-16
PC$center
##             Re.s. as.numeric.Im.s..     as.numeric.z.                 m 
##     -5.329071e-16     -1.332268e-16      2.441164e+17      3.605420e+19
PC$scale
##             Re.s. as.numeric.Im.s..     as.numeric.z.                 m 
##      2.344363e+01      5.860907e+00      4.505088e+19      3.038021e+20
head(PC$x)
##            PC1         PC2        PC3          PC4
## [1,] -7.268890 -10.2524823  1.3934124 7.934498e-15
## [2,] -1.452410   4.2338715  4.9771493 1.022439e-15
## [3,] -1.796304   2.0664863  1.0644330 9.217273e-16
## [4,] -1.978904   1.1383247 -0.1322022 1.025798e-15
## [5,] -1.976592   0.9525168 -0.3431348 8.239393e-16
## [6,] -1.935428   0.9181543 -0.3611749 8.243720e-16
# Eigenvalues
eig.val <- get_eigenvalue(PC)
eig.val
##         eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.111427e+00     5.278567e+01                    52.78567
## Dim.2 1.573812e+00     3.934529e+01                    92.13097
## Dim.3 3.147613e-01     7.869032e+00                   100.00000
## Dim.4 6.066303e-33     1.516576e-31                   100.00000
# Results for Variables
res.var <- get_pca_var(PC)
res.var$coord    # Coordinates
##                        Dim.1      Dim.2      Dim.3         Dim.4
## Re.s.              0.9358267 -0.3489331 0.04974015  5.507406e-17
## as.numeric.Im.s..  0.9358267 -0.3489331 0.04974015 -5.507406e-17
## as.numeric.z.      0.2988469  0.8726701 0.38618305 -1.202492e-32
## m                 -0.5201675 -0.7541552 0.40084384  4.323568e-32
res.var$contrib        # Contributions to the PCs
##                       Dim.1     Dim.2      Dim.3        Dim.4
## Re.s.             41.477714  7.736267  0.7860188 5.000000e+01
## as.numeric.Im.s.. 41.477714  7.736267  0.7860188 5.000000e+01
## as.numeric.z.      4.229816 48.389090 47.3810942 2.383639e-30
## m                 12.814756 36.138376 51.0468683 3.081488e-29
res.var$cos2           # Quality of representation 
##                        Dim.1     Dim.2       Dim.3        Dim.4
## Re.s.             0.87577164 0.1217543 0.002474083 3.033152e-33
## as.numeric.Im.s.. 0.87577164 0.1217543 0.002474083 3.033152e-33
## as.numeric.z.     0.08930947 0.7615532 0.149137345 1.445988e-64
## m                 0.27057422 0.5687500 0.160675783 1.869324e-63
# Results for individuals
res.ind <- get_pca_ind(PC)
head(res.ind$coord)          # Coordinates
##       Dim.1       Dim.2      Dim.3        Dim.4
## 1 -7.268890 -10.2524823  1.3934124 7.934498e-15
## 2 -1.452410   4.2338715  4.9771493 1.022439e-15
## 3 -1.796304   2.0664863  1.0644330 9.217273e-16
## 4 -1.978904   1.1383247 -0.1322022 1.025798e-15
## 5 -1.976592   0.9525168 -0.3431348 8.239393e-16
## 6 -1.935428   0.9181543 -0.3611749 8.243720e-16
head(res.ind$contrib)        # Contributions to the PCs
##        Dim.1      Dim.2       Dim.3      Dim.4
## 1 25.0241982 66.7890511  6.16847827 10378.0275
## 2  0.9990855 11.3899696 78.70095735   172.3259
## 3  1.5282114  2.7133902  3.59960902   140.0492
## 4  1.8546979  0.8233407  0.05552595   173.4601
## 5  1.8503666  0.5764910  0.37406595   111.9093
## 6  1.7740988  0.5356469  0.41443260   112.0269
head(res.ind$cos2)           # Quality of representation 
##        Dim.1     Dim.2       Dim.3        Dim.4
## 1 0.33045335 0.6574034 0.012143204 3.937430e-31
## 2 0.04707942 0.4000624 0.552858171 2.333066e-32
## 3 0.37389028 0.4948228 0.131286885 9.844407e-32
## 4 0.74886588 0.2477919 0.003342198 2.012234e-31
## 5 0.79216515 0.1839616 0.023873241 1.376488e-31
## 6 0.79373051 0.1786284 0.027641045 1.440010e-31

Visualize eigenvalues (scree plot). Show the percentage of variances explained by each principal component.

fviz_eig(PC)

Graph of individuals. Individuals with a similar profile are grouped together.

# Plot the individuals (observations) of the full PCA.
# Points are shaded by cos2, i.e. their quality of representation;
# repel = TRUE keeps the text labels from overlapping.
fviz_pca_ind(
  PC,
  repel = TRUE,
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  col.ind = "cos2"
)
## Warning: ggrepel: 97 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Graph of variables. Positively correlated variables point to the same side of the plot. Negatively correlated variables point to opposite sides of the plot.

# Plot the variables (attribute axes) of the full PCA.
# Arrows are shaded by their contribution to the principal components;
# repel = TRUE keeps the text labels from overlapping.
fviz_pca_var(
  PC,
  repel = TRUE,
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  col.var = "contrib"
)

Biplot of individuals and variables

# Biplot of the full PCA: individuals in grey, variables in blue,
# with label repulsion enabled to limit overlapping text.
fviz_pca_biplot(
  PC,
  col.ind = "#696969",
  col.var = "#2E9FDF",
  repel = TRUE
)
## Warning: ggrepel: 97 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Correlogram

M <- cor(dff)
M
##                          Re.s. as.numeric.Im.s.. as.numeric.z.          m
## Re.s.              1.000000000       1.000000000  -0.005625753 -0.2036989
## as.numeric.Im.s..  1.000000000       1.000000000  -0.005625753 -0.2036989
## as.numeric.z.     -0.005625753      -0.005625753   1.000000000 -0.6587800
## m                 -0.203698929      -0.203698929  -0.658780038  1.0000000
# Draw the upper triangle of the correlation matrix M, with variables
# ordered by hierarchical clustering and an 8-step RdYlBu palette.
corrplot(
  M,
  type = "upper",
  order = "hclust",
  col = brewer.pal(n = 8, name = "RdYlBu")
)

Principal components after removing the non-informative attribute

# PCA on the first three columns only (the modulus column m is dropped).
# scale. = TRUE standardizes every column to unit variance; the argument is
# spelled out in full rather than relying on partial matching of `scale`.
PCC <- prcomp(dff[, 1:3], scale. = TRUE)
PCC
## Standard deviations (1, .., p=3):
## [1] 1.414236e+00 9.999684e-01 4.080012e-16
## 
## Rotation (n x k) = (3 x 3):
##                            PC1         PC2           PC3
## Re.s.              0.707084406 0.005625219  7.071068e-01
## as.numeric.Im.s..  0.707084406 0.005625219 -7.071068e-01
## as.numeric.z.     -0.007955261 0.999968356 -5.100087e-16
PCC$sdev
## [1] 1.414236e+00 9.999684e-01 4.080012e-16
PCC$rotation
##                            PC1         PC2           PC3
## Re.s.              0.707084406 0.005625219  7.071068e-01
## as.numeric.Im.s..  0.707084406 0.005625219 -7.071068e-01
## as.numeric.z.     -0.007955261 0.999968356 -5.100087e-16
PCC$center
##             Re.s. as.numeric.Im.s..     as.numeric.z. 
##     -5.329071e-16     -1.332268e-16      2.441164e+17
PCC$scale
##             Re.s. as.numeric.Im.s..     as.numeric.z. 
##      2.344363e+01      5.860907e+00      4.505088e+19
head(PCC$x)
##            PC1          PC2           PC3
## [1,] -2.351885 -7.686629210  5.020805e-15
## [2,] -2.412446  6.053433982 -1.986771e-15
## [3,] -2.329719  1.782271526  1.918255e-16
## [4,] -2.268986  0.275811386  7.383108e-16
## [5,] -2.218061  0.002237966  8.780378e-16
## [6,] -2.169095 -0.025228451  1.114289e-15
# Eigenvalues
eig.val <- get_eigenvalue(PCC)
eig.val
##         eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.000063e+00     6.666878e+01                    66.66878
## Dim.2 9.999367e-01     3.333122e+01                   100.00000
## Dim.3 1.664650e-31     5.548833e-30                   100.00000
# Results for Variables
res.var <- get_pca_var(PCC)
res.var$coord    # Coordinates
##                         Dim.1       Dim.2         Dim.3
## Re.s.              0.99998418 0.005625041  2.885004e-16
## as.numeric.Im.s..  0.99998418 0.005625041 -2.885004e-16
## as.numeric.z.     -0.01125062 0.999936710 -2.080842e-31
res.var$contrib        # Contributions to the PCs
##                          Dim.1        Dim.2        Dim.3
## Re.s.             49.996835691  0.003164309 5.000000e+01
## as.numeric.Im.s.. 49.996835691  0.003164309 5.000000e+01
## as.numeric.z.      0.006328619 99.993671381 2.601089e-29
res.var$cos2           # Quality of representation 
##                          Dim.1        Dim.2        Dim.3
## Re.s.             0.9999683589 3.164109e-05 8.323250e-32
## as.numeric.Im.s.. 0.9999683589 3.164109e-05 8.323250e-32
## as.numeric.z.     0.0001265764 9.998734e-01 4.329902e-62
# Results for individuals
res.ind <- get_pca_ind(PCC)
head(res.ind$coord)          # Coordinates
##       Dim.1        Dim.2         Dim.3
## 1 -2.351885 -7.686629210  5.020805e-15
## 2 -2.412446  6.053433982 -1.986771e-15
## 3 -2.329719  1.782271526  1.918255e-16
## 4 -2.268986  0.275811386  7.383108e-16
## 5 -2.218061  0.002237966  8.780378e-16
## 6 -2.169095 -0.025228451  1.114289e-15
head(res.ind$contrib)        # Contributions to the PCs
##      Dim.1        Dim.2       Dim.3
## 1 2.765595 5.908801e+01 151.4341054
## 2 2.909857 3.664638e+01  23.7122503
## 3 2.713709 3.176693e+00   0.2210497
## 4 2.574067 7.607674e-02   3.2745794
## 5 2.459820 5.008811e-06   4.6313065
## 6 2.352411 6.365150e-04   7.4588621
head(res.ind$cos2)           # Quality of representation 
##        Dim.1        Dim.2        Dim.3
## 1 0.08560412 9.143959e-01 3.901297e-31
## 2 0.13705499 8.629450e-01 9.295553e-32
## 3 0.63081572 3.691843e-01 4.276696e-33
## 4 0.98543902 1.456098e-02 1.043385e-31
## 5 0.99999898 1.018028e-06 1.567036e-31
## 6 0.99986474 1.352588e-04 2.638638e-31

Visualize eigenvalues (scree plot). Show the percentage of variances explained by each principal component.

fviz_eig(PCC)

Graph of individuals. Individuals with a similar profile are grouped together.

# Plot the individuals (observations) of the reduced PCA.
# Points are shaded by cos2, i.e. their quality of representation;
# repel = TRUE keeps the text labels from overlapping.
fviz_pca_ind(
  PCC,
  repel = TRUE,
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  col.ind = "cos2"
)
## Warning: ggrepel: 93 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Graph of variables. Positively correlated variables point to the same side of the plot. Negatively correlated variables point to opposite sides of the plot.

# Plot the variables (attribute axes) of the reduced PCA.
# Arrows are shaded by their contribution to the principal components;
# repel = TRUE keeps the text labels from overlapping.
fviz_pca_var(
  PCC,
  repel = TRUE,
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  col.var = "contrib"
)

Biplot of individuals and variables

# Biplot of the reduced PCA: individuals in grey, variables in blue,
# with label repulsion enabled to limit overlapping text.
fviz_pca_biplot(
  PCC,
  col.ind = "#696969",
  col.var = "#2E9FDF",
  repel = TRUE
)
## Warning: ggrepel: 95 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Correlogram

M <- cor(dff[,1:3])
M
##                          Re.s. as.numeric.Im.s.. as.numeric.z.
## Re.s.              1.000000000       1.000000000  -0.005625753
## as.numeric.Im.s..  1.000000000       1.000000000  -0.005625753
## as.numeric.z.     -0.005625753      -0.005625753   1.000000000
# Draw the upper triangle of the correlation matrix M, with variables
# ordered by hierarchical clustering and an 8-step RdYlBu palette.
corrplot(
  M,
  type = "upper",
  order = "hclust",
  col = brewer.pal(n = 8, name = "RdYlBu")
)

Conclusions

We can see that by excluding the non-informative attribute from the attribute set, we get the most accurate representation of the data using the real part, the imaginary part and the zeta function values. This means that the modulus attribute $|z|$ is not informative for the principal component analysis. With the reduced attribute set, PC1 alone represents more than 66 % of the variance in the data and PC2 more than 33 %.