# Load the HCV data set from CSV (first row holds the column names) and
# show the type and first values of every column.
data <- read.csv(file = "hcvdat0.csv", header = TRUE)
str(object = data)
## 'data.frame': 615 obs. of 14 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Category: chr "0=Blood Donor" "0=Blood Donor" "0=Blood Donor" "0=Blood Donor" ...
## $ Age : int 32 32 32 32 32 32 32 32 32 32 ...
## $ Sex : chr "m" "m" "m" "m" ...
## $ ALB : num 38.5 38.5 46.9 43.2 39.2 41.6 46.3 42.2 50.9 42.4 ...
## $ ALP : num 52.5 70.3 74.7 52 74.1 43.3 41.3 41.9 65.5 86.3 ...
## $ ALT : num 7.7 18 36.2 30.6 32.6 18.5 17.5 35.8 23.2 20.3 ...
## $ AST : num 22.1 24.7 52.6 22.6 24.8 19.7 17.8 31.1 21.2 20 ...
## $ BIL : num 7.5 3.9 6.1 18.9 9.6 12.3 8.5 16.1 6.9 35.2 ...
## $ CHE : num 6.93 11.17 8.84 7.33 9.15 ...
## $ CHOL : num 3.23 4.8 5.2 4.74 4.32 6.05 4.79 4.6 4.1 4.45 ...
## $ CREA : num 106 74 86 80 76 111 70 109 83 81 ...
## $ GGT : num 12.1 15.6 33.2 33.8 29.9 91 16.9 21.5 13.7 15.9 ...
## $ PROT : num 69 76.5 79.3 75.7 68.7 74 74.5 67.1 71.3 69.9 ...
# Per-column summary: quartiles/mean for numeric columns, length/class for
# character columns, plus NA counts where values are missing.
summary(object = data)
## X Category Age Sex
## Min. : 1.0 Length:615 Min. :19.00 Length:615
## 1st Qu.:154.5 Class :character 1st Qu.:39.00 Class :character
## Median :308.0 Mode :character Median :47.00 Mode :character
## Mean :308.0 Mean :47.41
## 3rd Qu.:461.5 3rd Qu.:54.00
## Max. :615.0 Max. :77.00
##
## ALB ALP ALT AST
## Min. :14.90 Min. : 11.30 Min. : 0.90 Min. : 10.60
## 1st Qu.:38.80 1st Qu.: 52.50 1st Qu.: 16.40 1st Qu.: 21.60
## Median :41.95 Median : 66.20 Median : 23.00 Median : 25.90
## Mean :41.62 Mean : 68.28 Mean : 28.45 Mean : 34.79
## 3rd Qu.:45.20 3rd Qu.: 80.10 3rd Qu.: 33.08 3rd Qu.: 32.90
## Max. :82.20 Max. :416.60 Max. :325.30 Max. :324.00
## NA's :1 NA's :18 NA's :1
## BIL CHE CHOL CREA
## Min. : 0.8 Min. : 1.420 Min. :1.430 Min. : 8.00
## 1st Qu.: 5.3 1st Qu.: 6.935 1st Qu.:4.610 1st Qu.: 67.00
## Median : 7.3 Median : 8.260 Median :5.300 Median : 77.00
## Mean : 11.4 Mean : 8.197 Mean :5.368 Mean : 81.29
## 3rd Qu.: 11.2 3rd Qu.: 9.590 3rd Qu.:6.060 3rd Qu.: 88.00
## Max. :254.0 Max. :16.410 Max. :9.670 Max. :1079.10
## NA's :10
## GGT PROT
## Min. : 4.50 Min. :44.80
## 1st Qu.: 15.70 1st Qu.:69.30
## Median : 23.30 Median :72.20
## Mean : 39.53 Mean :72.04
## 3rd Qu.: 40.20 3rd Qu.:75.40
## Max. :650.90 Max. :90.00
## NA's :1
# Keep only the numeric columns (integer and double) for the matrix analyses.
# vapply() always yields a logical mask (sapply()'s return type depends on its
# input), and drop = FALSE keeps a data frame even if only one column matches.
# NOTE(review): X looks like a row index (1..615 in the summary above), not a
# measurement; it is kept so the reported results stay reproducible, but
# consider excluding it before cor()/cov()/eigen().
data_num <- data[, vapply(data, is.numeric, logical(1)), drop = FALSE]
str(data_num)
## 'data.frame': 615 obs. of 12 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 32 32 32 32 32 32 32 32 32 32 ...
## $ ALB : num 38.5 38.5 46.9 43.2 39.2 41.6 46.3 42.2 50.9 42.4 ...
## $ ALP : num 52.5 70.3 74.7 52 74.1 43.3 41.3 41.9 65.5 86.3 ...
## $ ALT : num 7.7 18 36.2 30.6 32.6 18.5 17.5 35.8 23.2 20.3 ...
## $ AST : num 22.1 24.7 52.6 22.6 24.8 19.7 17.8 31.1 21.2 20 ...
## $ BIL : num 7.5 3.9 6.1 18.9 9.6 12.3 8.5 16.1 6.9 35.2 ...
## $ CHE : num 6.93 11.17 8.84 7.33 9.15 ...
## $ CHOL: num 3.23 4.8 5.2 4.74 4.32 6.05 4.79 4.6 4.1 4.45 ...
## $ CREA: num 106 74 86 80 76 111 70 109 83 81 ...
## $ GGT : num 12.1 15.6 33.2 33.8 29.9 91 16.9 21.5 13.7 15.9 ...
## $ PROT: num 69 76.5 79.3 75.7 68.7 74 74.5 67.1 71.3 69.9 ...
# Pearson correlation matrix of the numeric columns.
# use = "complete.obs" drops every row that has an NA in any column.
cor_matrix <- cor(x = data_num, use = "complete.obs", method = "pearson")
cor_matrix
## X Age ALB ALP ALT AST
## X 1.00000000 0.44305790 -0.315204550 0.01794376 -0.20023304 0.30360292
## Age 0.44305790 1.00000000 -0.191093637 0.17771977 -0.04057647 0.07273886
## ALB -0.31520455 -0.19109364 1.000000000 -0.14611991 0.03949714 -0.17760895
## ALP 0.01794376 0.17771977 -0.146119911 1.00000000 0.22160301 0.06702428
## ALT -0.20023304 -0.04057647 0.039497139 0.22160301 1.00000000 0.19865775
## AST 0.30360292 0.07273886 -0.177608947 0.06702428 0.19865775 1.00000000
## BIL 0.17651109 0.03965486 -0.169597498 0.05837241 -0.10679662 0.30957974
## CHE -0.27853454 -0.07586328 0.360919403 0.02948169 0.22434447 -0.19727042
## CHOL -0.05794709 0.12474161 0.210419878 0.12590008 0.14999727 -0.20121300
## CREA -0.02016270 -0.02514225 0.001433247 0.15390895 -0.03610554 -0.01794810
## GGT 0.22146275 0.14337927 -0.147598318 0.46130000 0.21970686 0.47777362
## PROT -0.16648242 -0.15975998 0.570725680 -0.06308514 0.01678633 0.01740394
## BIL CHE CHOL CREA GGT PROT
## X 0.17651109 -0.27853454 -0.057947087 -0.020162704 0.221462754 -0.16648242
## Age 0.03965486 -0.07586328 0.124741615 -0.025142253 0.143379268 -0.15975998
## ALB -0.16959750 0.36091940 0.210419878 0.001433247 -0.147598318 0.57072568
## ALP 0.05837241 0.02948169 0.125900079 0.153908950 0.461299996 -0.06308514
## ALT -0.10679662 0.22434447 0.149997271 -0.036105541 0.219706857 0.01678633
## AST 0.30957974 -0.19727042 -0.201213004 -0.017948098 0.477773617 0.01740394
## BIL 1.00000000 -0.32071323 -0.181569556 0.019909617 0.210566559 -0.05257491
## CHE -0.32071323 1.00000000 0.428018276 -0.012119999 -0.095716131 0.30628754
## CHOL -0.18156956 0.42801828 1.000000000 -0.051464078 0.008822692 0.24504950
## CREA 0.01990962 -0.01212000 -0.051464078 1.000000000 0.125353469 -0.03011070
## GGT 0.21056656 -0.09571613 0.008822692 0.125353469 1.000000000 -0.03712701
## PROT -0.05257491 0.30628754 0.245049503 -0.030110695 -0.037127008 1.00000000
Matriks korelasi (correlation matrix) digunakan untuk melihat hubungan linear antarvariabel numerik dalam data. Nilai korelasi berada pada rentang -1 sampai 1. Jika nilainya mendekati 1, berarti kedua variabel memiliki hubungan yang kuat dan searah. Jika mendekati -1, berarti hubungannya kuat tetapi berlawanan arah. Nilai yang mendekati 0 menunjukkan bahwa hubungan antarvariabel lemah atau hampir tidak ada. Dari hasil matriks korelasi, dapat diketahui variabel mana yang saling berkaitan dan mana yang relatif independen.
# Pearson covariance matrix of the numeric columns (variances on the diagonal).
# use = "complete.obs" drops every row that has an NA in any column.
cov_matrix <- cov(x = data_num, use = "complete.obs", method = "pearson")
cov_matrix
## X Age ALB ALP ALT AST
## X 30325.61267 766.254175 -316.2678135 80.997414 -727.477884 1737.677172
## Age 766.25418 98.631388 -10.9348172 45.750544 -8.407388 23.742827
## ALB -316.26781 -10.934817 33.1982701 -21.823283 4.747912 -33.634186
## ALP 80.99741 45.750544 -21.8232826 671.901949 119.841675 57.100968
## ALT -727.47788 -8.407388 4.7479116 119.841675 435.269784 136.220708
## AST 1737.67717 23.742827 -33.6341863 57.100968 136.220708 1080.231200
## BIL 535.04466 6.855155 -17.0094554 26.337454 -38.783770 177.110426
## CHE -106.27733 -1.650806 4.5564303 1.674411 10.255372 -14.206173
## CHOL -11.39233 1.398606 1.3687395 3.684302 3.532962 -7.466047
## CREA -178.00646 -12.658841 0.4186596 202.254881 -38.188738 -29.906045
## GGT 2094.23092 77.323769 -46.1804560 649.315069 248.909775 852.706557
## PROT -155.07302 -8.486697 17.5892871 -8.746677 1.873261 3.059630
## BIL CHE CHOL CREA GGT PROT
## X 535.044661 -106.277326 -11.3923339 -178.0064562 2094.2309247 -155.073024
## Age 6.855155 -1.650806 1.3986055 -12.6588412 77.3237685 -8.486697
## ALB -17.009455 4.556430 1.3687395 0.4186596 -46.1804560 17.589287
## ALP 26.337454 1.674411 3.6843023 202.2548814 649.3150694 -8.746677
## ALT -38.783770 10.255372 3.5329616 -38.1887382 248.9097752 1.873261
## AST 177.110426 -14.206173 -7.4660468 -29.9060449 852.7065571 3.059630
## BIL 302.988734 -12.231702 -3.5680635 17.5694569 199.0314564 -4.895025
## CHE -12.231702 4.800799 1.0587548 -1.3462991 -11.3883550 3.589626
## CHOL -3.568064 1.058755 1.2745375 -2.9455248 0.5408745 1.479767
## CREA 17.569457 -1.346299 -2.9455248 2570.1849279 345.0941704 -8.165186
## GGT 199.031456 -11.388355 0.5408745 345.0941704 2948.7514092 -10.783808
## PROT -4.895025 3.589626 1.4797666 -8.1651857 -10.7838076 28.610549
Matriks kovarians (covariance matrix) menunjukkan seberapa besar penyebaran data pada setiap variabel serta bagaimana dua variabel berubah secara bersamaan. Nilai pada diagonal matriks merupakan varians masing-masing variabel yang menunjukkan seberapa besar variasi data tersebut. Nilai di luar diagonal merupakan kovarians yang menunjukkan hubungan perubahan antara dua variabel. Kovarians bernilai positif berarti kedua variabel cenderung meningkat bersama, sedangkan nilai negatif menunjukkan arah perubahan yang berlawanan.
# Eigenvalues and eigenvectors of the covariance matrix.
# NOTE(review): the covariance matrix is scale-dependent, so columns with
# large variance dominate the leading components — confirm this is intended.
eigen_result <- eigen(x = cov_matrix)
eigen_result[["values"]]
## [1] 3.064615e+04 3.411952e+03 2.431355e+03 8.106778e+02 4.759857e+02
## [6] 3.597636e+02 2.295242e+02 7.550312e+01 4.386454e+01 1.225674e+01
## [11] 3.543169e+00 8.832592e-01
# Column j of the matrix below is the eigenvector for the j-th eigenvalue;
# rows follow the column order of data_num.
eigen_result[["vectors"]]
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.9943168408 0.0781794505 -0.0428895929 -0.0328121735 0.020212889
## [2,] 0.0252084504 -0.0048342063 0.0049027524 -0.0592379614 0.060040370
## [3,] -0.0104849795 0.0082864828 -0.0043684634 -0.0004021013 -0.025903024
## [4,] 0.0044080667 -0.2356681437 0.0356393108 -0.4374522941 0.710618877
## [5,] -0.0230437543 -0.1041992516 0.0916985555 0.0673718565 0.547112440
## [6,] 0.0607143554 -0.2580042619 0.1999667225 0.8489173365 0.256978864
## [7,] 0.0184370838 -0.0576707031 0.0297876343 0.1542686413 -0.079875667
## [8,] -0.0035236442 0.0013755262 -0.0001678846 -0.0100541902 0.011976556
## [9,] -0.0003865148 0.0002186455 0.0009124350 -0.0103444081 0.005339638
## [10,] -0.0053562951 -0.4023281847 -0.9087666493 0.1021482314 0.004426586
## [11,] 0.0771038647 -0.8340483849 0.3488376959 -0.2101014484 -0.343945278
## [12,] -0.0050747796 0.0005044839 0.0043774244 0.0163142811 -0.009811142
## [,6] [,7] [,8] [,9] [,10]
## [1,] -0.029899000 0.0117861445 0.028276257 -0.0075378056 0.002522380
## [2,] 0.013200918 -0.0142637105 -0.987882489 -0.1224848387 -0.020137828
## [3,] -0.041230519 -0.0196324002 0.073071176 -0.7117277819 0.693190933
## [4,] 0.462008490 -0.1601735463 0.082208949 -0.0250189827 0.017575507
## [5,] -0.680285668 0.4612712030 0.014509161 -0.0004604439 -0.009521025
## [6,] 0.171499060 -0.2691408215 -0.025811818 -0.0013787705 0.020143047
## [7,] 0.524197933 0.8290819581 -0.013118146 -0.0459489418 0.009776830
## [8,] -0.030163171 -0.0150483056 0.002265340 -0.1084487331 -0.017859713
## [9,] -0.009869622 -0.0007455209 -0.011252758 -0.0491322591 -0.030609508
## [10,] -0.039663866 0.0110805318 -0.008986684 -0.0015590011 -0.003436025
## [11,] -0.119064616 0.0091672413 -0.002651554 0.0038121325 -0.002258407
## [12,] -0.006258120 -0.0244543592 0.099611920 -0.6793031525 -0.718958094
## [,11] [,12]
## [1,] 1.166495e-03 -0.0005701834
## [2,] -1.638485e-02 -0.0153975609
## [3,] -6.807153e-02 0.0016019737
## [4,] -4.373501e-03 -0.0027458982
## [5,] -2.126558e-02 -0.0043095508
## [6,] 9.352837e-03 0.0072123962
## [7,] 2.699176e-02 -0.0001634271
## [8,] 9.680597e-01 -0.2222163944
## [9,] 2.170998e-01 0.9742461763
## [10,] 4.016040e-04 0.0012264068
## [11,] 6.161861e-05 -0.0015769054
## [12,] -9.765782e-02 -0.0337955255
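Eigen value menunjukkan seberapa besar variasi data yang dapat dijelaskan oleh setiap komponen utama. Semakin besar nilai eigen value, semakin besar peran komponen tersebut dalam merepresentasikan data. Eigen vector menunjukkan kontribusi masing-masing variabel terhadap komponen utama. Hasil ini dapat membantu untuk memahami struktur data dan melihat variabel mana yang paling berpengaruh tanpa harus melihat semua variabel satu per satu. Pada hasil di atas, komponen pertama hampir seluruhnya ditentukan oleh variabel X (bobot sekitar 0,994), yaitu variabel dengan varians terbesar yang tampaknya hanya berisi nomor urut baris; hal ini terjadi karena eigen value dihitung dari matriks kovarians yang bergantung pada skala masing-masing variabel.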