Praktikum 12 STA582

Principal Component Analysis

Prinsip utama dalam Principal Component Analysis (PCA) yaitu mentransfer satu himpunan peubah yang saling berkorelasi ke dalam himpunan peubah baru yang tidak berkorelasi dan memetakan data tersebut ke dalam dimensi yang lebih rendah. Komponen utama pertama merupakan arah keberagaman terbesar dalam data, sementra komponen utama kedua adalah arah ortogonal (tidak berkorelasi) dari keberagaman terbesar, dan seterusnya. Dalam R PCA dapat dilakukan menggunakan fungsi prcomp.

Data Breast Cancer

Data yang digunakan dalam studi ini adalah data Breast Cancer Winconsin (Diagnostic) Data Set (WDBC) yang dapat diakses pada laman https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)

Peubah data merupakan hasil ekstraksi fitur dari digitized image of a fine needle aspirate (FNA) of a breast mass yang mendeskripsikan 3 sel nukleus. Masing-masing sel nukleus memiliki 3 fitur, sehingga total terdapat 30 fitur (peubah), dengan peubah respon yaitu prognosis (benign (1) dan malignant (2))

  1. radius (mean of distances from center to points on the perimeter)
  2. texture (standard deviation of gray-scale values)
  3. perimeter
  4. area
  5. smoothness (local variation in radius lengths)
  6. compactness (perimeter^2 / area - 1.0)
  7. concavity (severity of h. concave portions of the contour)
  8. concave points (number of concave portions of the contour)
  9. symmetry
  10. fractal dimension (“coastline approximation” - 1)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(corrplot)
## corrplot 0.84 loaded
library(imager)
## Loading required package: magrittr
## 
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
## 
##     add
## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum
## The following object is masked from 'package:graphics':
## 
##     frame
## The following object is masked from 'package:base':
## 
##     save.image
library(rgl)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:imager':
## 
##     highlight
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
cancer<-read.csv('D:/MAGISTER (S2)/STA582 (Pembelajaran Mesin Statistika)/Praktikum/breast-cancer-wisconsin.csv')
head(cancer)
##      V1    V2     V3     V4      V5      V6     V7      V8     V9     V10
## 1 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871
## 2 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667
## 3 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999
## 4 11.42 20.38  77.58  386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744
## 5 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883
## 6 12.45 15.70  82.57  477.1 0.12780 0.17000 0.1578 0.08089 0.2087 0.07613
##      V11    V12   V13    V14      V15     V16     V17     V18     V19      V20
## 1 1.0950 0.9053 8.589 153.40 0.006399 0.04904 0.05373 0.01587 0.03003 0.006193
## 2 0.5435 0.7339 3.398  74.08 0.005225 0.01308 0.01860 0.01340 0.01389 0.003532
## 3 0.7456 0.7869 4.585  94.03 0.006150 0.04006 0.03832 0.02058 0.02250 0.004571
## 4 0.4956 1.1560 3.445  27.23 0.009110 0.07458 0.05661 0.01867 0.05963 0.009208
## 5 0.7572 0.7813 5.438  94.44 0.011490 0.02461 0.05688 0.01885 0.01756 0.005115
## 6 0.3345 0.8902 2.217  27.19 0.007510 0.03345 0.03672 0.01137 0.02165 0.005082
##     V21   V22    V23    V24    V25    V26    V27    V28    V29     V30 Class
## 1 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890     2
## 2 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902     2
## 3 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758     2
## 4 14.91 26.50  98.87  567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300     2
## 5 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678     2
## 6 15.47 23.75 103.40  741.6 0.1791 0.5249 0.5355 0.1741 0.3985 0.12440     2
str(cancer)
## 'data.frame':    569 obs. of  31 variables:
##  $ V1   : num  18 20.6 19.7 11.4 20.3 ...
##  $ V2   : num  10.4 17.8 21.2 20.4 14.3 ...
##  $ V3   : num  122.8 132.9 130 77.6 135.1 ...
##  $ V4   : num  1001 1326 1203 386 1297 ...
##  $ V5   : num  0.1184 0.0847 0.1096 0.1425 0.1003 ...
##  $ V6   : num  0.2776 0.0786 0.1599 0.2839 0.1328 ...
##  $ V7   : num  0.3001 0.0869 0.1974 0.2414 0.198 ...
##  $ V8   : num  0.1471 0.0702 0.1279 0.1052 0.1043 ...
##  $ V9   : num  0.242 0.181 0.207 0.26 0.181 ...
##  $ V10  : num  0.0787 0.0567 0.06 0.0974 0.0588 ...
##  $ V11  : num  1.095 0.543 0.746 0.496 0.757 ...
##  $ V12  : num  0.905 0.734 0.787 1.156 0.781 ...
##  $ V13  : num  8.59 3.4 4.58 3.44 5.44 ...
##  $ V14  : num  153.4 74.1 94 27.2 94.4 ...
##  $ V15  : num  0.0064 0.00522 0.00615 0.00911 0.01149 ...
##  $ V16  : num  0.049 0.0131 0.0401 0.0746 0.0246 ...
##  $ V17  : num  0.0537 0.0186 0.0383 0.0566 0.0569 ...
##  $ V18  : num  0.0159 0.0134 0.0206 0.0187 0.0188 ...
##  $ V19  : num  0.03 0.0139 0.0225 0.0596 0.0176 ...
##  $ V20  : num  0.00619 0.00353 0.00457 0.00921 0.00511 ...
##  $ V21  : num  25.4 25 23.6 14.9 22.5 ...
##  $ V22  : num  17.3 23.4 25.5 26.5 16.7 ...
##  $ V23  : num  184.6 158.8 152.5 98.9 152.2 ...
##  $ V24  : num  2019 1956 1709 568 1575 ...
##  $ V25  : num  0.162 0.124 0.144 0.21 0.137 ...
##  $ V26  : num  0.666 0.187 0.424 0.866 0.205 ...
##  $ V27  : num  0.712 0.242 0.45 0.687 0.4 ...
##  $ V28  : num  0.265 0.186 0.243 0.258 0.163 ...
##  $ V29  : num  0.46 0.275 0.361 0.664 0.236 ...
##  $ V30  : num  0.1189 0.089 0.0876 0.173 0.0768 ...
##  $ Class: int  2 2 2 2 2 2 2 2 2 2 ...
sum(is.na(cancer))
## [1] 0

Data lengkap, tidak ada missing value

kor<-cor(cancer[,-31])
corrplot(kor)

Berdasarkan plot korelasi beberapa peubah saling berkorelasi (multikolinearitas). Misalkan data tersebut dimodelkan dengan regresi logistik.

cancer$Class<-factor(cancer$Class)
logistik<-glm(Class~., data=cancer, family='binomial')
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistik)
## 
## Call:
## glm(formula = Class ~ ., family = "binomial", data = cancer)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -8.49   -8.49   -8.49    8.49    8.49  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.531e+06  2.816e+05  -8.988  < 2e-16 ***
## V1           2.131e+06  2.693e+05   7.916 2.45e-15 ***
## V2           1.720e+05  1.471e+04  11.694  < 2e-16 ***
## V3           1.294e+06  2.464e+04  52.522  < 2e-16 ***
## V4          -1.143e+05  3.907e+03 -29.251  < 2e-16 ***
## V5          -1.339e+08  8.361e+06 -16.016  < 2e-16 ***
## V6          -5.650e+06  3.213e+06  -1.759  0.07865 .  
## V7           9.166e+05  1.408e+06   0.651  0.51519    
## V8          -1.507e+07  5.382e+06  -2.800  0.00511 ** 
## V9           3.556e+07  7.772e+05  45.757  < 2e-16 ***
## V10         -3.718e+07  2.169e+06 -17.143  < 2e-16 ***
## V11          2.924e+07  1.169e+06  25.015  < 2e-16 ***
## V12          5.594e+06  2.005e+05  27.899  < 2e-16 ***
## V13          1.494e+06  4.720e+04  31.649  < 2e-16 ***
## V14         -5.616e+05  1.835e+04 -30.602  < 2e-16 ***
## V15          6.580e+08  1.224e+07  53.769  < 2e-16 ***
## V16         -1.557e+08  5.732e+06 -27.168  < 2e-16 ***
## V17          1.343e+08  5.340e+06  25.142  < 2e-16 ***
## V18         -1.107e+09  4.012e+07 -27.579  < 2e-16 ***
## V19          2.539e+08  4.125e+06  61.535  < 2e-16 ***
## V20          1.328e+09  6.597e+07  20.133  < 2e-16 ***
## V21         -5.385e+06  2.143e+05 -25.127  < 2e-16 ***
## V22         -5.123e+05  2.437e+04 -21.023  < 2e-16 ***
## V23         -3.108e+05  1.219e+04 -25.493  < 2e-16 ***
## V24          7.862e+04  2.741e+03  28.685  < 2e-16 ***
## V25         -1.898e+07  3.298e+06  -5.756 8.63e-09 ***
## V26          7.893e+06  3.999e+05  19.737  < 2e-16 ***
## V27         -2.660e+07  1.523e+06 -17.458  < 2e-16 ***
## V28          1.257e+08  5.471e+06  22.980  < 2e-16 ***
## V29         -2.173e+07  3.392e+05 -64.055  < 2e-16 ***
## V30         -3.248e+07  5.340e+06  -6.083 1.18e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:   751.44  on 568  degrees of freedom
## Residual deviance: 32006.76  on 538  degrees of freedom
## AIC: 32069
## 
## Number of Fisher Scoring iterations: 25

Terdapat Warning bahwa algoritma yang digunakan tidak konvergen, sehingga perlu dilakukan transformasi data.

boxplot(cancer[,-31])

Antar peubah dalam data memiliki skala yang berbeda, sehingga perlu dilakukan standarisasi.

cancer_scale<-scale(cancer[,-31])
cbind(eigen(cov(cancer_scale))$values,
      eigen(cor(cancer[,-31]))$values)
##               [,1]         [,2]
##  [1,] 1.328161e+01 1.328161e+01
##  [2,] 5.691355e+00 5.691355e+00
##  [3,] 2.817949e+00 2.817949e+00
##  [4,] 1.980640e+00 1.980640e+00
##  [5,] 1.648730e+00 1.648730e+00
##  [6,] 1.207357e+00 1.207357e+00
##  [7,] 6.752201e-01 6.752201e-01
##  [8,] 4.766171e-01 4.766171e-01
##  [9,] 4.168948e-01 4.168948e-01
## [10,] 3.506934e-01 3.506934e-01
## [11,] 2.939157e-01 2.939157e-01
## [12,] 2.611613e-01 2.611613e-01
## [13,] 2.413576e-01 2.413576e-01
## [14,] 1.570097e-01 1.570097e-01
## [15,] 9.413496e-02 9.413496e-02
## [16,] 7.986280e-02 7.986280e-02
## [17,] 5.939903e-02 5.939903e-02
## [18,] 5.261879e-02 5.261879e-02
## [19,] 4.947759e-02 4.947759e-02
## [20,] 3.115940e-02 3.115940e-02
## [21,] 2.997289e-02 2.997289e-02
## [22,] 2.743943e-02 2.743943e-02
## [23,] 2.434083e-02 2.434083e-02
## [24,] 1.805501e-02 1.805501e-02
## [25,] 1.548127e-02 1.548127e-02
## [26,] 8.177641e-03 8.177641e-03
## [27,] 6.900464e-03 6.900464e-03
## [28,] 1.589338e-03 1.589338e-03
## [29,] 7.488031e-04 7.488031e-04
## [30,] 1.330448e-04 1.330448e-04

Nilai eigen (akar ciri) dari matriks korelasi data asli dan matriks peragam data terstandarisasi memiliki nilai yang sama.

hasilpca<-prcomp(cancer[,-31], scale. = T)
head(hasilpca$sdev^2,n=5)
## [1] 13.281608  5.691355  2.817949  1.980640  1.648730
plot(hasilpca$sdev^2, type = 'o',
     main = 'Scree Plot',
     ylab='')

Untuk menentukan banyaknya komponen utama yang akan digunakan, terdapat beberapa opsi kriteria yang dapat digunakan:

  • Persentase keragaman kumulatifnya >70%
  • Akar ciri>1
  • Elbow rule, pada scree plot dipilih titik yang sebelah kirinya curam tetapi sebelah kanan landai
  • Tetap menggunakan semua peubah/ komponen utama jika tujuannya hanya untuk merotasi data, misalnya Rotation Forest
perbandingan<-data.frame(
  'Lambda'=round(hasilpca$sdev^2,4),
  'Pctn'=round(hasilpca$sdev^2/sum(hasilpca$sdev^2)*100,2),
  'CumPctn'=cumsum(round(hasilpca$sdev^2/sum(hasilpca$sdev^2)*100,2)))
perbandingan
##     Lambda  Pctn CumPctn
## 1  13.2816 44.27   44.27
## 2   5.6914 18.97   63.24
## 3   2.8179  9.39   72.63
## 4   1.9806  6.60   79.23
## 5   1.6487  5.50   84.73
## 6   1.2074  4.02   88.75
## 7   0.6752  2.25   91.00
## 8   0.4766  1.59   92.59
## 9   0.4169  1.39   93.98
## 10  0.3507  1.17   95.15
## 11  0.2939  0.98   96.13
## 12  0.2612  0.87   97.00
## 13  0.2414  0.80   97.80
## 14  0.1570  0.52   98.32
## 15  0.0941  0.31   98.63
## 16  0.0799  0.27   98.90
## 17  0.0594  0.20   99.10
## 18  0.0526  0.18   99.28
## 19  0.0495  0.16   99.44
## 20  0.0312  0.10   99.54
## 21  0.0300  0.10   99.64
## 22  0.0274  0.09   99.73
## 23  0.0243  0.08   99.81
## 24  0.0181  0.06   99.87
## 25  0.0155  0.05   99.92
## 26  0.0082  0.03   99.95
## 27  0.0069  0.02   99.97
## 28  0.0016  0.01   99.98
## 29  0.0007  0.00   99.98
## 30  0.0001  0.00   99.98

Jika digunakan kriteria pertama persentase keragaman kumulatif >70, maka diambil 3 komponen utama yang memiliki persentase keragaman kumulatif 72.63%.

cancerpca<-data.frame(cancer_scale%*%hasilpca$rotation[,1:3])
korelasi<-cor(cancerpca)
corrplot(korelasi)

Setelah dilakukan transformasi peubah dengan PCA, antar peubah tidak lagi saling berkorelasi.

plot_ly(cancerpca,x=~PC1, y=~PC2, z=~PC3)
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Selanjutnya dilakukan pemodelan dengan model logistik pada data yang sudah ditransformasi.

cancerpca$Class<-cancer$Class

set.seed(123)
idx<-sample.int(nrow(cancerpca), size = round(0.3*nrow(cancerpca)))
pcatrain<-cancerpca[-idx,]
pcatest<-cancerpca[idx,]

logistik_pca<-glm(Class~., data=pcatrain, family = 'binomial')
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logistik_pca)
## 
## Call:
## glm(formula = Class ~ ., family = "binomial", data = pcatrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5732  -0.0493  -0.0045   0.0028   3.7972  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.8405     0.3690  -2.278   0.0227 *  
## PC1          -2.7960     0.5268  -5.308 1.11e-07 ***
## PC2           1.4730     0.3429   4.296 1.74e-05 ***
## PC3          -0.4621     0.2431  -1.901   0.0574 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 533.88  on 397  degrees of freedom
## Residual deviance:  69.94  on 394  degrees of freedom
## AIC: 77.94
## 
## Number of Fisher Scoring iterations: 9
predik<-predict(logistik_pca, pcatest,type='response')
prediksi<-as.factor(ifelse(predik>0.5,'2','1'))
confusionMatrix(prediksi,pcatest$Class, positive='2')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2
##          1 113   7
##          2   3  48
##                                           
##                Accuracy : 0.9415          
##                  95% CI : (0.8951, 0.9716)
##     No Information Rate : 0.6784          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8634          
##                                           
##  Mcnemar's Test P-Value : 0.3428          
##                                           
##             Sensitivity : 0.8727          
##             Specificity : 0.9741          
##          Pos Pred Value : 0.9412          
##          Neg Pred Value : 0.9417          
##              Prevalence : 0.3216          
##          Detection Rate : 0.2807          
##    Detection Prevalence : 0.2982          
##       Balanced Accuracy : 0.9234          
##                                           
##        'Positive' Class : 2               
## 

Data Gambar

setwd("D:/MAGISTER (S2)/STA582 (Pembelajaran Mesin Statistika)/pca")
library(imager)

e1 <- as.data.frame(load.image("e1.png"))
e2 <- as.data.frame(load.image("e2.png"))
e3 <- as.data.frame(load.image("e3.png"))
e4 <- as.data.frame(load.image("e4.png"))
e5 <- as.data.frame(load.image("e5.png"))

f1 <- as.data.frame(load.image("f1.png"))
f2 <- as.data.frame(load.image("f2.png"))
f3 <- as.data.frame(load.image("f3.png"))
f4 <- as.data.frame(load.image("f4.png"))
f5 <- as.data.frame(load.image("f5.png"))

g1 <- as.data.frame(load.image("g1.png"))
g2 <- as.data.frame(load.image("g2.png"))
g3 <- as.data.frame(load.image("g3.png"))
g4 <- as.data.frame(load.image("g4.png"))
g5 <- as.data.frame(load.image("g5.png"))

o1 <- as.data.frame(load.image("o1.png"))
o2 <- as.data.frame(load.image("o2.png"))
o3 <- as.data.frame(load.image("o3.png"))
o4 <- as.data.frame(load.image("o4.png"))
o5 <- as.data.frame(load.image("o5.png"))


huruf <- cbind(e1[,4],e2[,4],e3[,4],e4[,4],e5[,4],
               f1[,4],f2[,4],f3[,4],f4[,4],f5[,4],
               g1[,4],g2[,4],g3[,4],g4[,4],g5[,4],
               o1[,4],o2[,4],o3[,4],o4[,4],o5[,4])
huruf <- t(huruf)
nk <- ncol(huruf)
colnames(huruf) <- paste("x",1:nk, sep="")
mydt <- as.matrix(huruf)
#mydt <- as.matrix(huruf[,1:256])
alf <- rep(c("e","f","g","o"),each=5)
cv<-cov(mydt)
ecv <-eigen(cv)
ecv$values[1:10]
##  [1] 88.7823252  7.9526159  7.1242310  5.2959714  3.3654987  2.4273075
##  [7]  1.4991734  1.1055020  0.8341791  0.7404901
proporsi<-ecv$values/sum(ecv$values)
proporsi[1:10]*100
##  [1] 73.7088628  6.6024208  5.9146791  4.3968214  2.7941044  2.0151993
##  [7]  1.2446437  0.9178099  0.6925522  0.6147697
cumsum(proporsi[1:15]) #cumulative sum
##  [1] 0.7370886 0.8031128 0.8622596 0.9062278 0.9341689 0.9543209 0.9667673
##  [8] 0.9759454 0.9828709 0.9890186 0.9932904 0.9971328 0.9990780 1.0000000
## [15] 1.0000000
xx <- seq(0,4,by=0.5)
yy <- seq(0,3,by=0.5)

pca <- mydt %*% ecv$vectors
plot(pca[,1:2],pch=alf,col=rep(1:4,each=5),cex=1.5,xlab="PC1",ylab="PC2")

library(rgl)
mycolors <- c('black', 'red', 'green', "blue")
dta <- as.data.frame(pca[,1:3])
colnames(dta) <- c("PC1","PC2","PC3")
dta$warna <- alf
warna <- mycolors[ col=rep(1:4,each=5) ]

plot_ly(dta,x=~PC1, y=~PC2, z=~PC3,
        color=warna)
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode