Modul 1: Principal Component Analysis and Factor Analysis

Dataset –> Student Placement Link Dataset –> https://www.kaggle.com/datasets/sonalshinde123/student-placement-dataset

Variabel

X1 = Age X2 = CGPA X3 = Internship X4 = Projects X5 = Coding_Skills X6 = Communication_Skills X7 = Aptitude_Test_Score X8 = Soft_Skills_Rating X9 = Certifications X10 = Backlogs

Import Library

library(knitr)
library(rmarkdown)
library(prettydoc)
library(equatiomatic)
## 
## Attaching package: 'equatiomatic'
## The following object is masked from 'package:datasets':
## 
##     penguins
library(DT)
library(psych)
library(REdaS)
## Loading required package: grid
library(corrplot)
## corrplot 0.95 loaded
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library('FactoMineR')

Import Data

data <- read.csv("D:/BIODATA KAFKA/Sem 4/Analisis Multivariat/Student Placement.csv", header = TRUE, sep =",")
summary(data)
##    Student_ID         Age           Gender             Degree         
##  Min.   :   12   Min.   :18.00   Length:5000        Length:5000       
##  1st Qu.:12424   1st Qu.:19.00   Class :character   Class :character  
##  Median :25469   Median :21.00   Mode  :character   Mode  :character  
##  Mean   :25203   Mean   :21.01                                        
##  3rd Qu.:37718   3rd Qu.:23.00                                        
##  Max.   :49999   Max.   :24.00                                        
##     Branch               CGPA        Internships        Projects    
##  Length:5000        Min.   :4.500   Min.   :0.0000   Min.   :1.000  
##  Class :character   1st Qu.:6.330   1st Qu.:0.0000   1st Qu.:3.000  
##  Mode  :character   Median :7.010   Median :1.0000   Median :4.000  
##                     Mean   :7.005   Mean   :0.7706   Mean   :3.749  
##                     3rd Qu.:7.690   3rd Qu.:1.0000   3rd Qu.:4.000  
##                     Max.   :9.800   Max.   :3.0000   Max.   :6.000  
##  Coding_Skills    Communication_Skills Aptitude_Test_Score Soft_Skills_Rating
##  Min.   : 1.000   Min.   : 1.000       Min.   : 35.00      Min.   : 1.00     
##  1st Qu.: 4.000   1st Qu.: 4.000       1st Qu.: 60.00      1st Qu.: 5.00     
##  Median : 6.000   Median : 5.000       Median : 70.00      Median : 6.00     
##  Mean   : 5.721   Mean   : 5.484       Mean   : 69.52      Mean   : 5.51     
##  3rd Qu.: 7.000   3rd Qu.: 7.000       3rd Qu.: 79.00      3rd Qu.: 6.00     
##  Max.   :10.000   Max.   :10.000       Max.   :100.00      Max.   :10.00     
##  Certifications     Backlogs      Placement_Status  
##  Min.   :0.000   Min.   :0.0000   Length:5000       
##  1st Qu.:1.000   1st Qu.:0.0000   Class :character  
##  Median :2.000   Median :1.0000   Mode  :character  
##  Mean   :1.807   Mean   :0.8698                     
##  3rd Qu.:2.000   3rd Qu.:2.0000                     
##  Max.   :3.000   Max.   :3.0000
data_numerik <- data[, !(names(data) %in% c("Student_ID", "Gender", "Degree", "Branch", "Placement_Status"))]
summary(data_numerik)
##       Age             CGPA        Internships        Projects    
##  Min.   :18.00   Min.   :4.500   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:19.00   1st Qu.:6.330   1st Qu.:0.0000   1st Qu.:3.000  
##  Median :21.00   Median :7.010   Median :1.0000   Median :4.000  
##  Mean   :21.01   Mean   :7.005   Mean   :0.7706   Mean   :3.749  
##  3rd Qu.:23.00   3rd Qu.:7.690   3rd Qu.:1.0000   3rd Qu.:4.000  
##  Max.   :24.00   Max.   :9.800   Max.   :3.0000   Max.   :6.000  
##  Coding_Skills    Communication_Skills Aptitude_Test_Score Soft_Skills_Rating
##  Min.   : 1.000   Min.   : 1.000       Min.   : 35.00      Min.   : 1.00     
##  1st Qu.: 4.000   1st Qu.: 4.000       1st Qu.: 60.00      1st Qu.: 5.00     
##  Median : 6.000   Median : 5.000       Median : 70.00      Median : 6.00     
##  Mean   : 5.721   Mean   : 5.484       Mean   : 69.52      Mean   : 5.51     
##  3rd Qu.: 7.000   3rd Qu.: 7.000       3rd Qu.: 79.00      3rd Qu.: 6.00     
##  Max.   :10.000   Max.   :10.000       Max.   :100.00      Max.   :10.00     
##  Certifications     Backlogs     
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.0000  
##  Median :2.000   Median :1.0000  
##  Mean   :1.807   Mean   :0.8698  
##  3rd Qu.:2.000   3rd Qu.:2.0000  
##  Max.   :3.000   Max.   :3.0000

Asumsi Data

Korelasi

cor(data_numerik)
##                               Age        CGPA  Internships     Projects
## Age                   1.000000000  0.01101079  0.009868339  0.008996816
## CGPA                  0.011010795  1.00000000  0.373514922  0.700337915
## Internships           0.009868339  0.37351492  1.000000000  0.522191630
## Projects              0.008996816  0.70033792  0.522191630  1.000000000
## Coding_Skills         0.017931178  0.46758718  0.537636462  0.894409152
## Communication_Skills -0.002467067 -0.01133818 -0.007554366  0.001947031
## Aptitude_Test_Score   0.004893432  0.70182019  0.273366821  0.500845369
## Soft_Skills_Rating   -0.009540773 -0.03585377 -0.023266676 -0.026705362
## Certifications        0.008630309  0.54758873  0.477900513  0.827602624
## Backlogs             -0.018190546 -0.58427076 -0.220171779 -0.416125053
##                      Coding_Skills Communication_Skills Aptitude_Test_Score
## Age                    0.017931178         -0.002467067         0.004893432
## CGPA                   0.467587175         -0.011338179         0.701820191
## Internships            0.537636462         -0.007554366         0.273366821
## Projects               0.894409152          0.001947031         0.500845369
## Coding_Skills          1.000000000          0.004976343         0.337021335
## Communication_Skills   0.004976343          1.000000000        -0.008423186
## Aptitude_Test_Score    0.337021335         -0.008423186         1.000000000
## Soft_Skills_Rating    -0.022127375          0.005992770        -0.027359018
## Certifications         0.858133397         -0.001136872         0.528736700
## Backlogs              -0.289973406         -0.003013710        -0.409167795
##                      Soft_Skills_Rating Certifications     Backlogs
## Age                        -0.009540773    0.008630309 -0.018190546
## CGPA                       -0.035853772    0.547588730 -0.584270759
## Internships                -0.023266676    0.477900513 -0.220171779
## Projects                   -0.026705362    0.827602624 -0.416125053
## Coding_Skills              -0.022127375    0.858133397 -0.289973406
## Communication_Skills        0.005992770   -0.001136872 -0.003013710
## Aptitude_Test_Score        -0.027359018    0.528736700 -0.409167795
## Soft_Skills_Rating          1.000000000   -0.024712075  0.005512266
## Certifications             -0.024712075    1.000000000 -0.328503983
## Backlogs                    0.005512266   -0.328503983  1.000000000
corrplot(cor(data_numerik))

Eigen Value

eigen(cor(data_numerik))
## eigen() decomposition
## $values
##  [1] 4.18119960 1.12555352 1.01144173 0.99817547 0.99006213 0.63619131
##  [7] 0.58413171 0.29097113 0.12893176 0.05334163
## 
## $vectors
##               [,1]        [,2]         [,3]          [,4]         [,5]
##  [1,] -0.009258134  0.01770335  0.581666047  5.883050e-01  0.560483019
##  [2,] -0.395577131  0.39652918 -0.013096292  6.843211e-05 -0.012557841
##  [3,] -0.303524799 -0.32932746  0.028077447 -2.213220e-02  0.010181367
##  [4,] -0.454084731 -0.17274258 -0.009277584 -4.173397e-03  0.004870011
##  [5,] -0.413736063 -0.41136733  0.004844642 -8.920826e-04  0.017582199
##  [6,]  0.002029160 -0.05035715 -0.472849650  8.048772e-01 -0.353524591
##  [7,] -0.335821190  0.43510050 -0.019134184 -7.665428e-03 -0.022181226
##  [8,]  0.020095922 -0.01740675 -0.659293992 -6.150139e-02  0.747795496
##  [9,] -0.428159633 -0.24150677 -0.004689881 -1.088150e-02  0.005486329
## [10,]  0.278263791 -0.53334298  0.044178475 -4.002925e-02 -0.023764020
##               [,6]        [,7]         [,8]         [,9]         [,10]
##  [1,] -0.022911973  0.02077937  0.002629553 -0.002394337 -0.0072200776
##  [2,] -0.018067191  0.13464608  0.689022580 -0.348789757  0.2666303186
##  [3,]  0.729620313  0.50857954 -0.068313948 -0.034354664 -0.0344716673
##  [4,] -0.125527074 -0.17729244  0.329687327  0.393596759 -0.6730810145
##  [5,] -0.121093803 -0.27570860 -0.031872693  0.348294329  0.6680286218
##  [6,]  0.004029167  0.03144887  0.005437373 -0.005539157  0.0009131115
##  [7,] -0.346158602  0.54947570 -0.404074736  0.333109675  0.0317336768
##  [8,] -0.010911620  0.03674201  0.012572021 -0.003411459  0.0023094587
##  [9,] -0.291539218 -0.10390157 -0.384287838 -0.698458669 -0.1639237658
## [10,] -0.481067428  0.54797772  0.315790146 -0.045645293  0.0219854865

KMO/MSA

KMOS(data_numerik)
## 
## Kaiser-Meyer-Olkin Statistics
## 
## Call: KMOS(x = data_numerik)
## 
## Measures of Sampling Adequacy (MSA):
##                  Age                 CGPA          Internships 
##            0.3466923            0.7174396            0.9546544 
##             Projects        Coding_Skills Communication_Skills 
##            0.7581822            0.6908664            0.3464423 
##  Aptitude_Test_Score   Soft_Skills_Rating       Certifications 
##            0.7745864            0.7821473            0.8425321 
##             Backlogs 
##            0.8736235 
## 
## KMO-Criterion: 0.7747372

Bartlett

bart_spher(data_numerik)
##  Bartlett's Test of Sphericity
## 
## Call: bart_spher(x = data_numerik)
## 
##      X2 = 28248.231
##      df = 45
## p-value < 2.22e-16

PCA

pca <- prcomp(data_numerik, center = TRUE, scale. = TRUE)
pca
## Standard deviations (1, .., p=10):
##  [1] 2.0447982 1.0609211 1.0057046 0.9990873 0.9950187 0.7976160 0.7642851
##  [8] 0.5394174 0.3590707 0.2309581
## 
## Rotation (n x k) = (10 x 10):
##                               PC1         PC2          PC3           PC4
## Age                  -0.009258134  0.01770335  0.581666047 -5.883050e-01
## CGPA                 -0.395577131  0.39652918 -0.013096292 -6.843211e-05
## Internships          -0.303524799 -0.32932746  0.028077447  2.213220e-02
## Projects             -0.454084731 -0.17274258 -0.009277584  4.173397e-03
## Coding_Skills        -0.413736063 -0.41136733  0.004844642  8.920826e-04
## Communication_Skills  0.002029160 -0.05035715 -0.472849650 -8.048772e-01
## Aptitude_Test_Score  -0.335821190  0.43510050 -0.019134184  7.665428e-03
## Soft_Skills_Rating    0.020095922 -0.01740675 -0.659293992  6.150139e-02
## Certifications       -0.428159633 -0.24150677 -0.004689881  1.088150e-02
## Backlogs              0.278263791 -0.53334298  0.044178475  4.002925e-02
##                               PC5          PC6         PC7          PC8
## Age                  -0.560483019 -0.022911973 -0.02077937  0.002629553
## CGPA                  0.012557841 -0.018067191 -0.13464608  0.689022580
## Internships          -0.010181367  0.729620313 -0.50857954 -0.068313948
## Projects             -0.004870011 -0.125527074  0.17729244  0.329687327
## Coding_Skills        -0.017582199 -0.121093803  0.27570860 -0.031872693
## Communication_Skills  0.353524591  0.004029167 -0.03144887  0.005437373
## Aptitude_Test_Score   0.022181226 -0.346158602 -0.54947570 -0.404074736
## Soft_Skills_Rating   -0.747795496 -0.010911620 -0.03674201  0.012572021
## Certifications       -0.005486329 -0.291539218  0.10390157 -0.384287838
## Backlogs              0.023764020 -0.481067428 -0.54797772  0.315790146
##                               PC9          PC10
## Age                   0.002394337  0.0072200776
## CGPA                  0.348789757 -0.2666303186
## Internships           0.034354664  0.0344716673
## Projects             -0.393596759  0.6730810145
## Coding_Skills        -0.348294329 -0.6680286218
## Communication_Skills  0.005539157 -0.0009131115
## Aptitude_Test_Score  -0.333109675 -0.0317336768
## Soft_Skills_Rating    0.003411459 -0.0023094587
## Certifications        0.698458669  0.1639237658
## Backlogs              0.045645293 -0.0219854865
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     2.0448 1.0609 1.0057 0.99909 0.99502 0.79762 0.76429
## Proportion of Variance 0.4181 0.1126 0.1011 0.09982 0.09901 0.06362 0.05841
## Cumulative Proportion  0.4181 0.5307 0.6318 0.73164 0.83064 0.89426 0.95268
##                           PC8     PC9    PC10
## Standard deviation     0.5394 0.35907 0.23096
## Proportion of Variance 0.0291 0.01289 0.00533
## Cumulative Proportion  0.9818 0.99467 1.00000

PCA Principal

pc_pri_non <- principal(data_numerik, nfactors = ncol(data_numerik), rotate = "none")
pc_pri_non
## Principal Components Analysis
## Call: principal(r = data_numerik, nfactors = ncol(data_numerik), rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        PC1   PC2   PC3   PC4   PC5   PC6   PC7   PC8   PC9
## Age                   0.02 -0.02 -0.58  0.59  0.56  0.02  0.02  0.00  0.00
## CGPA                  0.81 -0.42  0.01  0.00 -0.01  0.01  0.10  0.37  0.13
## Internships           0.62  0.35 -0.03 -0.02  0.01 -0.58  0.39 -0.04  0.01
## Projects              0.93  0.18  0.01  0.00  0.00  0.10 -0.14  0.18 -0.14
## Coding_Skills         0.85  0.44  0.00  0.00  0.02  0.10 -0.21 -0.02 -0.13
## Communication_Skills  0.00  0.05  0.48  0.80 -0.35  0.00  0.02  0.00  0.00
## Aptitude_Test_Score   0.69 -0.46  0.02 -0.01 -0.02  0.28  0.42 -0.22 -0.12
## Soft_Skills_Rating   -0.04  0.02  0.66 -0.06  0.74  0.01  0.03  0.01  0.00
## Certifications        0.88  0.26  0.00 -0.01  0.01  0.23 -0.08 -0.21  0.25
## Backlogs             -0.57  0.57 -0.04 -0.04 -0.02  0.38  0.42  0.17  0.02
##                       PC10 h2       u2 com
## Age                   0.00  1 -2.2e-16 3.0
## CGPA                  0.06  1  3.1e-15 2.1
## Internships          -0.01  1 -4.4e-16 3.3
## Projects             -0.16  1  3.1e-15 1.3
## Coding_Skills         0.15  1  1.7e-15 1.8
## Communication_Skills  0.00  1  0.0e+00 2.1
## Aptitude_Test_Score   0.01  1  2.3e-15 3.3
## Soft_Skills_Rating    0.00  1  1.0e-15 2.0
## Certifications       -0.04  1  1.8e-15 1.7
## Backlogs              0.01  1  8.9e-16 3.8
## 
##                        PC1  PC2  PC3  PC4  PC5  PC6  PC7  PC8  PC9 PC10
## SS loadings           4.18 1.13 1.01 1.00 0.99 0.64 0.58 0.29 0.13 0.05
## Proportion Var        0.42 0.11 0.10 0.10 0.10 0.06 0.06 0.03 0.01 0.01
## Cumulative Var        0.42 0.53 0.63 0.73 0.83 0.89 0.95 0.98 0.99 1.00
## Proportion Explained  0.42 0.11 0.10 0.10 0.10 0.06 0.06 0.03 0.01 0.01
## Cumulative Proportion 0.42 0.53 0.63 0.73 0.83 0.89 0.95 0.98 0.99 1.00
## 
## Mean item complexity =  2.4
## Test of the hypothesis that 10 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0 
##  with the empirical chi square  0  with prob <  NA 
## 
## Fit based upon off diagonal values = 1
pc_pri <- principal(data_numerik, nfactors = 3, rotate = "none")
pc_pri
## Principal Components Analysis
## Call: principal(r = data_numerik, nfactors = 3, rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        PC1   PC2   PC3   h2    u2 com
## Age                   0.02 -0.02 -0.58 0.34 0.657 1.0
## CGPA                  0.81 -0.42  0.01 0.83 0.169 1.5
## Internships           0.62  0.35 -0.03 0.51 0.492 1.6
## Projects              0.93  0.18  0.01 0.90 0.104 1.1
## Coding_Skills         0.85  0.44  0.00 0.91 0.094 1.5
## Communication_Skills  0.00  0.05  0.48 0.23 0.771 1.0
## Aptitude_Test_Score   0.69 -0.46  0.02 0.68 0.315 1.8
## Soft_Skills_Rating   -0.04  0.02  0.66 0.44 0.558 1.0
## Certifications        0.88  0.26  0.00 0.83 0.168 1.2
## Backlogs             -0.57  0.57 -0.04 0.65 0.354 2.0
## 
##                        PC1  PC2  PC3
## SS loadings           4.18 1.13 1.01
## Proportion Var        0.42 0.11 0.10
## Cumulative Var        0.42 0.53 0.63
## Proportion Explained  0.66 0.18 0.16
## Cumulative Proportion 0.66 0.84 1.00
## 
## Mean item complexity =  1.4
## Test of the hypothesis that 3 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.07 
##  with the empirical chi square  2364.78  with prob <  0 
## 
## Fit based upon off diagonal values = 0.96

PCA FactoMineR

pca_result <- PCA(data_numerik,
                  scale.unit = FALSE,   # karena sudah di-scale
                  graph = FALSE,
                  ncp = ncol(data_numerik))
pca_result$eig
##           eigenvalue percentage of variance cumulative percentage of variance
## comp 1  194.10862026            93.18112175                          93.18112
## comp 2    4.53716297             2.17804822                          95.35917
## comp 3    3.97031712             1.90593598                          97.26511
## comp 4    2.32444172             1.11583961                          98.38095
## comp 5    1.54377845             0.74108511                          99.12203
## comp 6    0.87014919             0.41771188                          99.53974
## comp 7    0.48812461             0.23432240                          99.77406
## comp 8    0.32570236             0.15635221                          99.93042
## comp 9    0.07883831             0.03784604                          99.96826
## comp 10   0.06611196             0.03173680                         100.00000

ScreePlot

fviz_eig(pca,
         addlabels = TRUE,
         ncp = ncol(data_numerik),
         barfill = "skyblue",
         barcolor = "darkblue",
         linecolor = "red")
## Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
## Ignoring empty aesthetic: `width`.

Biplot

fviz_pca_biplot(pca_result,
                geom.ind = "point",
                #col.ind = status.ipm,
                #palette = c("#FC4E07","#E7B800", "#00AFBB"),
                addEllipses = TRUE,)

Correlation Circle

contrib_circle <- fviz_pca_var(pca_result, col.var = "contrib",
                               gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
                               repel = TRUE) +
  ggtitle("correlation circle")
plot(contrib_circle)

Variable Contribution PC1-PC3

contrib_v_PC1 <- fviz_contrib(pca_result, choice = "var", axes = 1, top = 5) + ggtitle("PC1")
plot(contrib_v_PC1)

contrib_v_PC2 <- fviz_contrib(pca_result, choice = "var", axes = 2, top = 5) + ggtitle("PC2")
plot(contrib_v_PC2)

contrib_v_PC3 <- fviz_contrib(pca_result, choice = "var", axes = 3, top = 5) + ggtitle("PC3")
plot(contrib_v_PC3)

FA Principal

fa <- fa(cor(data_numerik), nfactors = 3, rotate = "varimax")
fa
## Factor Analysis using method =  minres
## Call: fa(r = cor(data_numerik), nfactors = 3, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        MR1   MR2   MR3      h2     u2 com
## Age                   0.01  0.00  0.02 0.00034 0.9997 1.6
## CGPA                  0.34  0.61  0.72 0.99639 0.0036 2.4
## Internships           0.50  0.08  0.22 0.30409 0.6959 1.4
## Projects              0.83  0.18  0.43 0.90560 0.0944 1.6
## Coding_Skills         0.97 -0.06  0.24 0.99597 0.0040 1.1
## Communication_Skills  0.00 -0.01  0.00 0.00013 0.9999 1.3
## Aptitude_Test_Score   0.40  0.91  0.02 0.99501 0.0050 1.4
## Soft_Skills_Rating   -0.02 -0.02 -0.01 0.00109 0.9989 2.7
## Certifications        0.85  0.20  0.19 0.80334 0.1967 1.2
## Backlogs             -0.21 -0.35 -0.42 0.34152 0.6585 2.4
## 
##                        MR1  MR2  MR3
## SS loadings           2.92 1.41 1.01
## Proportion Var        0.29 0.14 0.10
## Cumulative Var        0.29 0.43 0.53
## Proportion Explained  0.55 0.26 0.19
## Cumulative Proportion 0.55 0.81 1.00
## 
## Mean item complexity =  1.7
## Test of the hypothesis that 3 factors are sufficient.
## 
## df null model =  45  with the objective function =  5.66
## df of  the model are 18  and the objective function was  0.01 
## 
## The root mean square of the residuals (RMSR) is  0 
## The df corrected root mean square of the residuals is  0 
## 
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    MR1  MR2  MR3
## Correlation of (regression) scores with factors   1.00 1.00 0.99
## Multiple R square of scores with factors          0.99 0.99 0.99
## Minimum correlation of possible factor scores     0.99 0.99 0.98
loads=fa$loadings
fa.diagram(fa)