# Make the Module 6 course folder the working directory for this session.
# NOTE(review): this is a machine-specific absolute path; setwd() stops with an
# error on any computer where this folder does not exist.
setwd(dir = "C:/Users/Sam/Documents/DSCI_605/Module_6/")

# install.packages("FactoMineR")
# install.packages("factoextra")
library(FactoMineR)
library(factoextra)

Please use the sample data provided to make your own PCA analysis.

# Load the sample data set; header = TRUE treats the first line of the CSV as
# the column names.
df <- read.csv(
  file = "C:/Users/Sam/Documents/DSCI_605/Module_6/Samples-1.csv",
  header = TRUE
)

You need to provide a description of the data you used and the variables you chose.

This is the sample data set that the assignment asked us to use (“Please use the sample data provided to make your own PCA analysis.”). It appears to come from some type of scientific experiment involving multiple species of an organism, with measurements collected at multiple sites.

# Keep only columns 2 through 8 as the PCA inputs. Per the original note, these
# are all of the numeric variables; the non-numeric column is left out.
df <- df[2:8]

Please explain why you chose these variables for visualization.

Each species is assumed to be equally important, so all of them were chosen; dimension reduction will help make the structure in the data clearer.

Calculate PCA using packages

# Fit the PCA with FactoMineR:
#   scale.unit = TRUE  scales the variables to unit variance,
#   ncp = 7            keeps up to seven dimensions in the results,
#   graph = FALSE      suppresses the automatic plots.
df.pca <- PCA(
  X = df,
  ncp = 7,
  scale.unit = TRUE,
  graph = FALSE
)
# Print the overview of the result components stored in the fitted object.
print(df.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 6 individuals, described by 7 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"
# Extract the eigenvalue table (eigenvalue, variance percent and cumulative
# variance percent per dimension). The enclosing parentheses display the value
# as it is assigned.
(eig.val <- get_eigenvalue(df.pca))
##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1 3.964859352       56.6408479                    56.64085
## Dim.2 2.048397556       29.2628222                    85.90367
## Dim.3 0.599987080        8.5712440                    94.47491
## Dim.4 0.377301937        5.3900277                    99.86494
## Dim.5 0.009454076        0.1350582                   100.00000

Please use fviz_eig() to get the percentage of explained variances;

Try to make your graph more professional since this course is about data visualization.

# Scree plot of the percentage of explained variance per dimension, with value
# labels added and the y axis limited to the 0-60 range.
fviz_eig(df.pca, ylim = c(0, 60), addlabels = TRUE)

# Collect the variable-level PCA results (coord, cor, cos2, contrib). The
# enclosing parentheses display the object as it is assigned.
(var <- get_pca_var(df.pca))
## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"
# Collect the individual-level PCA results (coord, cos2, contrib). The
# enclosing parentheses display the object as it is assigned.
(ind <- get_pca_ind(df.pca))
## Principal Component Analysis Results for individuals
##  ===================================================
##   Name       Description                       
## 1 "$coord"   "Coordinates for the individuals" 
## 2 "$cos2"    "Cos2 for the individuals"        
## 3 "$contrib" "contributions of the individuals"
# Coordinates of the variables on each dimension (first six rows).
head(var[["coord"]], n = 6L)
##                Dim.1       Dim.2        Dim.3       Dim.4        Dim.5
## speciesA -0.85923096 -0.01075314 -0.169120077  0.48205532  0.025051742
## speciesB  0.92561001 -0.36725561 -0.003195971  0.08104185 -0.042325249
## speciesC -0.04812317  0.85415477 -0.470162856 -0.21657288  0.012118959
## speciesD -0.60591717  0.54074830  0.576607174 -0.08881910  0.009538841
## speciesE  0.96101189 -0.25397146  0.068339245 -0.02478600  0.081670367
## speciesF  0.73812254  0.63668651  0.074816996  0.21016789 -0.006105393
# Cos2 of the variables: quality of representation on the factor map
# (first six rows).
head(var[["cos2"]], n = 6L)
##                Dim.1      Dim.2        Dim.3        Dim.4        Dim.5
## speciesA 0.738277847 0.00011563 2.860160e-02 0.2323773331 6.275898e-04
## speciesB 0.856753892 0.13487669 1.021423e-05 0.0065677818 1.791427e-03
## speciesC 0.002315839 0.72958037 2.210531e-01 0.0469038128 1.468692e-04
## speciesD 0.367135619 0.29240873 3.324758e-01 0.0078888318 9.098949e-05
## speciesE 0.923543849 0.06450150 4.670252e-03 0.0006143457 6.670049e-03
## speciesF 0.544824887 0.40536971 5.597583e-03 0.0441705417 3.727583e-05
# Contributions of the variables to the principal components (first six rows).
head(var[["contrib"]], n = 6L)
##                Dim.1        Dim.2        Dim.3     Dim.4      Dim.5
## speciesA 18.62053055  0.005644899  4.767036054 61.589223  6.6382988
## speciesB 21.60868309  6.584497463  0.001702408  1.740723 18.9487241
## speciesC  0.05840912 35.617127419 36.842978567 12.431373  1.5535012
## speciesD  9.25973878 14.274998802 55.413832109  2.090854  0.9624366
## speciesE 23.29323103  3.148876258  0.778392161  0.162826 70.5520999
## speciesF 13.74134208 19.789601479  0.932950561 11.706948  0.3942832
# head(var$coord, 3)

Please use fviz_pca_var() to show the variables;

# Plot the variables on the factor map, drawn in black.
fviz_pca_var(X = df.pca, col.var = "black")

As suspected, every variable shows some importance. The most important variables appear to be species B, E, F, and G.

Please use corrplot() to graphically display a correlation matrix;

library("corrplot")
## corrplot 0.90 loaded
# Visualise the cos2 of each variable on every dimension. The cos2 table is not
# a correlation matrix, hence is.corr = FALSE.
corrplot(corr = var[["cos2"]], is.corr = FALSE)

# Bar plot of the variables' cos2 on dimensions 1 and 2.
fviz_cos2(X = df.pca, axes = 1:2, choice = "var")

# Coordinates of the individuals on each dimension (first six rows).
head(ind[["coord"]], n = 6L)
##        Dim.1     Dim.2      Dim.3        Dim.4       Dim.5
## 1  1.1951413 -1.320730 -1.2905781 -0.002962978  0.09781942
## 2  1.2876107 -1.254958  0.1164269  0.136953784 -0.18805503
## 3  2.9904266  1.051379  0.9465344  0.147367970  0.07863657
## 4 -2.8184606 -1.609609  0.8631057  0.133677885  0.06469264
## 5 -0.9447908  1.140238 -0.1202938 -1.236919052 -0.02401635
## 6 -1.7099272  1.993680 -0.5151951  0.821882392 -0.02907725
# Cos2 of the individuals: quality of representation (first six rows).
head(ind[["cos2"]], n = 6L)
##       Dim.1     Dim.2       Dim.3        Dim.4        Dim.5
## 1 0.2946378 0.3598139 0.343572684 1.810952e-06 0.0019737866
## 2 0.5023248 0.4771706 0.004106977 5.682814e-03 0.0107148307
## 3 0.8150521 0.1007482 0.081656707 1.979362e-03 0.0005635964
## 4 0.7028866 0.2292461 0.065915766 1.581177e-03 0.0003703148
## 5 0.2388122 0.3478376 0.003871432 4.093245e-01 0.0001543118
## 6 0.3729225 0.5069604 0.033853748 8.615545e-02 0.0001078376
# Contributions of the individuals to each dimension (first six rows).
head(ind[["contrib"]], n = 6L)
##       Dim.1     Dim.2      Dim.3        Dim.4     Dim.5
## 1  6.004259 14.192615 46.2674395 3.878079e-04 16.868632
## 2  6.969316 12.814238  0.3765423 8.285291e-01 62.344708
## 3 37.591292  8.993997 24.8874054 9.593253e-01 10.901313
## 4 33.392190 21.080222 20.6935414 7.893668e-01  7.378013
## 5  3.752255 10.578542  0.4019697 6.758375e+01  1.016819
## 6 12.290688 32.340386  7.3731017 2.983864e+01  1.490515
# Describe dimensions 1 and 2 by the variables most associated with them,
# using a 0.05 significance threshold.
df.desc <- dimdesc(df.pca, proba = 0.05, axes = c(1, 2))
# Variables associated with the first dimension.
df.desc[["Dim.1"]]
## $quanti
##          correlation     p.value
## speciesE   0.9610119 0.002250477
## speciesB   0.9256100 0.008094973
## speciesA  -0.8592310 0.028329149
## 
## attr(,"class")
## [1] "condes" "list"
# Variables associated with the second dimension.
df.desc[["Dim.2"]]
## $quanti
##          correlation    p.value
## speciesC   0.8541548 0.03035512
## 
## attr(,"class")
## [1] "condes" "list"

Please use fviz_pca_ind() to show individuals.

# Default map of the individuals.
fviz_pca_ind(X = df.pca)

# Same map, with the individuals coloured by their cos2 along a
# red-blue-green gradient and overlapping labels repelled.
fviz_pca_ind(
  X = df.pca,
  repel = TRUE,
  col.ind = "cos2",
  gradient.cols = c("red", "blue", "green")
)

# Bar plot of the individuals' cos2.
fviz_cos2(X = df.pca, choice = "ind")

# Bar plot of the individuals' contributions to dimensions 1 and 2.
fviz_contrib(X = df.pca, axes = 1:2, choice = "ind")

# Colour the individuals by a custom continuous variable. The values are random
# draws used only to demonstrate the feature; the seed makes them reproducible.
set.seed(123)
# One value per individual: take the count from the fitted PCA rather than
# hard-coding 6, so the vector always matches the number of plotted points.
my.cont.var <- rnorm(nrow(df.pca$ind$coord))

# Map of the individuals, coloured by my.cont.var along a red-blue-green
# gradient, with a custom legend title.
fviz_pca_ind(df.pca, col.ind = my.cont.var,
             gradient.cols = c("red", "blue", "green"),
             legend.title = "Cont.Var")

# Biplot showing individuals (green) and variables (red) together, with
# overlapping labels repelled.
fviz_pca_biplot(
  X = df.pca,
  col.ind = "green",
  col.var = "red",
  repel = TRUE
)