## Memuat Library untuk Visualisasi Correlation Matrix

library(ggplot2)
library(reshape2)

Memasukkan Dataset

Pada tahap ini, dataset yang berisi fitur numerik dan non-numerik dimasukkan. Tujuannya adalah agar dataset tersebut dapat dianalisis pada tahap-tahap berikutnya.

# Build the five-row example data set. Industry and Citizen are character
# columns; every other column is numeric. Columns that hold the same value in
# every row are written with rep() to make that explicit.
dataset_full <- data.frame(
  Gender = c(1, 0, 0, 1, 0),
  Age = c(30.83, 58.67, 24.50, 27.83, 20.17),
  Debt = c(0.000, 4.460, 0.500, 1.540, 5.625),
  Married = rep(1, 5),
  BankCustomer = rep(1, 5),
  Industry = c(
    "Industrials", "Materials", "Materials", "Industrials", "Industrials"
  ),
  YearsEmployed = c(1.25, 3.04, 1.50, 3.75, 1.71),
  PriorDefault = rep(1, 5),
  Employed = c(1, 1, 0, 1, 0),
  CreditScore = c(1, 6, 0, 5, 0),
  DriversLicense = c(0, 0, 0, 1, 0),
  Citizen = c("ByBirth", "ByBirth", "ByBirth", "ByBirth", "ByOtherMeans"),
  ZipCode = c(202, 43, 280, 100, 120),
  Income = c(0, 560, 824, 3, 0),
  Approved = rep(1, 5)
)

# Show the complete data set (the recorded console output follows each call).
print("Dataset:")
## [1] "Dataset:"
print(dataset_full)
##   Gender   Age  Debt Married BankCustomer    Industry YearsEmployed
## 1      1 30.83 0.000       1            1 Industrials          1.25
## 2      0 58.67 4.460       1            1   Materials          3.04
## 3      0 24.50 0.500       1            1   Materials          1.50
## 4      1 27.83 1.540       1            1 Industrials          3.75
## 5      0 20.17 5.625       1            1 Industrials          1.71
##   PriorDefault Employed CreditScore DriversLicense      Citizen ZipCode Income
## 1            1        1           1              0      ByBirth     202      0
## 2            1        1           6              0      ByBirth      43    560
## 3            1        0           0              0      ByBirth     280    824
## 4            1        1           5              1      ByBirth     100      3
## 5            1        0           0              0 ByOtherMeans     120      0
##   Approved
## 1        1
## 2        1
## 3        1
## 4        1
## 5        1

Menyeleksi Fitur pada Dataset, dengan Memilih Hanya Fitur Numerik

Setelah dataset dimasukkan, fitur yang bertipe numerik perlu diseleksi, karena hanya fitur numerik yang dapat digunakan dalam perhitungan matematis seperti eigen values, covariance matrix, dan correlation matrix.

# Keep only the numeric columns of dataset_full; the character columns cannot
# be passed to cov(), eigen() or cor().
# vapply() is used instead of sapply() because it always returns a logical
# vector: for a data frame with no columns sapply() returns an empty list,
# which is not a valid subscript.
dataset_numeric <- dataset_full[vapply(dataset_full, is.numeric, logical(1))]

print("Data setelah hanya berisi fitur numerik:")
## [1] "Data setelah hanya berisi fitur numerik:"
print(dataset_numeric)
##   Gender   Age  Debt Married BankCustomer YearsEmployed PriorDefault Employed
## 1      1 30.83 0.000       1            1          1.25            1        1
## 2      0 58.67 4.460       1            1          3.04            1        1
## 3      0 24.50 0.500       1            1          1.50            1        0
## 4      1 27.83 1.540       1            1          3.75            1        1
## 5      0 20.17 5.625       1            1          1.71            1        0
##   CreditScore DriversLicense ZipCode Income Approved
## 1           1              0     202      0        1
## 2           6              0      43    560        1
## 3           0              0     280    824        1
## 4           5              1     100      3        1
## 5           0              0     120      0        1

Menghitung Variance-Covariance Matrix, Eigen Values, dan Eigen Vectors

Eigen values dan eigen vectors digunakan dalam analisis dimensi dan reduksi data, terutama dalam PCA (Principal Component Analysis). Eigen values menggambarkan besarnya informasi yang dapat dijelaskan oleh tiap dimensi dalam dataset, sedangkan eigen vectors menunjukkan arah dari dimensi utama dalam dataset. Sebelum menghitung eigen values dan eigen vectors, variance-covariance matrix perlu dihitung terlebih dahulu. Variance-covariance matrix digunakan untuk melihat seberapa besar hubungan antarvariabel dalam dataset.

# Variance-covariance matrix of the numeric columns: variances on the
# diagonal, pairwise covariances elsewhere.
cov_matrix <- stats::cov(dataset_numeric)

print("Variance-Covariance Matrix:")
## [1] "Variance-Covariance Matrix:"
print(cov_matrix)
##                   Gender         Age        Debt Married BankCustomer
## Gender            0.3000   -1.535000   -0.827500       0            0
## Age              -1.5350  231.361400    9.345663       0            0
## Debt             -0.8275    9.345663    6.187675       0            0
## Married           0.0000    0.000000    0.000000       0            0
## BankCustomer      0.0000    0.000000    0.000000       0            0
## YearsEmployed     0.1250    6.999375    0.605225       0            0
## PriorDefault      0.0000    0.000000    0.000000       0            0
## Employed          0.2000    5.032500   -0.318750       0            0
## CreditScore       0.3000   33.300000    1.340000       0            0
## DriversLicense    0.1500   -1.142500   -0.221250       0            0
## ZipCode           1.0000 -831.032500 -161.461250       0            0
## Income         -137.9500 2046.972500 -112.313750       0            0
## Approved          0.0000    0.000000    0.000000       0            0
##                YearsEmployed PriorDefault  Employed CreditScore DriversLicense
## Gender              0.125000            0   0.20000        0.30        0.15000
## Age                 6.999375            0   5.03250       33.30       -1.14250
## Debt                0.605225            0  -0.31875        1.34       -0.22125
## Married             0.000000            0   0.00000        0.00        0.00000
## BankCustomer        0.000000            0   0.00000        0.00        0.00000
## YearsEmployed       1.182050            0   0.32250        2.81        0.37500
## PriorDefault        0.000000            0   0.00000        0.00        0.00000
## Employed            0.322500            0   0.30000        1.20        0.10000
## CreditScore         2.810000            0   1.20000        8.30        0.65000
## DriversLicense      0.375000            0   0.10000        0.65        0.20000
## ZipCode           -73.207500            0 -25.50000     -207.00      -12.25000
## Income            -42.775000            0 -67.30000       11.55      -68.60000
## Approved            0.000000            0   0.00000        0.00        0.00000
##                   ZipCode      Income Approved
## Gender             1.0000   -137.9500        0
## Age             -831.0325   2046.9725        0
## Debt            -161.4613   -112.3137        0
## Married            0.0000      0.0000        0
## BankCustomer       0.0000      0.0000        0
## YearsEmployed    -73.2075    -42.7750        0
## PriorDefault       0.0000      0.0000        0
## Employed         -25.5000    -67.3000        0
## CreditScore     -207.0000     11.5500        0
## DriversLicense   -12.2500    -68.6000        0
## ZipCode         8612.0000  12109.2500        0
## Income         12109.2500 151957.8000        0
## Approved           0.0000      0.0000        0
# Eigen decomposition of the covariance matrix. The result is a list whose
# `values` and `vectors` components are printed below.
eigen_calculate <- eigen(cov_matrix)

print("Eigen Values:")
## [1] "Eigen Values:"
print(eigen_calculate$values)
##  [1]  1.529993e+05  7.738337e+03  7.516727e+01  4.867587e+00  0.000000e+00
##  [6] -1.733029e-29 -4.791007e-16 -5.033986e-16 -7.406715e-16 -1.918058e-14
## [11] -2.260795e-14 -2.521282e-13 -1.110098e-12

print("Eigen Vectors:")
## [1] "Eigen Vectors:"
print(eigen_calculate$vectors)
##                [,1]          [,2]          [,3]          [,4] [,5]
##  [1,]  8.979963e-04  1.599479e-03  2.669568e-02  1.456467e-01    0
##  [2,] -1.289705e-02 -1.315601e-01  9.779281e-01 -1.368644e-01    0
##  [3,]  8.188070e-04 -1.960531e-02 -1.280374e-01 -6.212193e-01    0
##  [4,]  0.000000e+00 -2.775558e-17 -9.992007e-16  2.886580e-15    0
##  [5,]  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00    0
##  [6,]  3.179461e-04 -9.024786e-03 -1.829859e-02  3.240546e-01    0
##  [7,]  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00    0
##  [8,]  4.517914e-04 -2.638175e-03  4.507633e-02  1.130247e-01    0
##  [9,]  3.495695e-05 -2.713553e-02  8.404525e-02  6.522530e-01    0
## [10,]  4.535475e-04 -8.272715e-04 -1.323359e-02  1.755846e-01    0
## [11,] -8.349356e-02  9.873753e-01  1.280421e-01 -9.271224e-03    0
## [12,] -9.964239e-01 -8.105261e-02 -2.345634e-02  2.426576e-03    0
## [13,]  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00    1
##                [,6]          [,7]          [,8]          [,9]         [,10]
##  [1,]  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00
##  [2,]  3.173865e-17  2.211582e-04  5.827793e-05  1.467290e-03  1.019239e-02
##  [3,]  1.525036e-15  1.845961e-03  5.438426e-03  5.448677e-03 -6.073995e-03
##  [4,]  2.172232e-14  1.263052e-02  8.046240e-02 -6.975346e-02 -9.826273e-01
##  [5,] -1.310497e-14 -9.786714e-01  1.359410e-01  1.535353e-01 -1.213435e-02
##  [6,]  1.504406e-13  5.511389e-02  5.737888e-01 -1.505700e-01  1.587244e-01
##  [7,]  1.000000e+00 -4.964085e-14 -2.584517e-13 -3.080869e-15  5.273559e-16
##  [8,]  1.685824e-13  1.750799e-01  6.117721e-01  5.735624e-01 -6.427223e-02
##  [9,] -7.274479e-14 -7.130678e-02 -2.651466e-01 -2.224637e-01 -6.416719e-02
## [10,] -1.105307e-13  5.712574e-02 -4.485693e-01  7.552278e-01 -2.670500e-02
## [11,] -2.243282e-16 -8.584723e-04 -6.428388e-04 -4.943334e-03  7.150675e-04
## [12,]  9.121773e-17  1.910588e-04  3.045746e-04  9.476757e-04 -1.897342e-04
## [13,]  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00
##               [,11]        [,12]         [,13]
##  [1,]  9.889747e-01  0.000000000  0.000000e+00
##  [2,] -6.016930e-03 -0.074752973 -4.147413e-02
##  [3,]  9.497431e-02 -0.088977385 -7.617554e-01
##  [4,] -3.885781e-16 -0.149331453  2.538405e-02
##  [5,]  0.000000e+00 -0.001407085 -4.176827e-05
##  [6,] -4.721539e-02 -0.691581337 -1.841802e-01
##  [7,]  0.000000e+00  0.000000000  0.000000e+00
##  [8,] -1.785809e-02  0.474309528 -1.479025e-01
##  [9,] -9.828235e-02  0.276441596 -5.930360e-01
## [10,] -2.550026e-02 -0.429450749 -9.141105e-02
## [11,] -3.611980e-03 -0.009420849 -3.886030e-02
## [12,]  1.311648e-03  0.001492445  2.978830e-03
## [13,]  0.000000e+00  0.000000000  0.000000e+00

Menghitung Correlation Matrix

Correlation matrix digunakan untuk melihat seberapa kuat hubungan antarvariabel dalam skala -1 hingga 1. Apabila nilainya:
1. mendekati 1 = variabel berkorelasi positif dengan kuat
2. mendekati -1 = variabel berkorelasi negatif dengan kuat
3. mendekati 0 = tidak ada hubungan yang kuat antarvariabel

# Pairwise correlation matrix of the numeric columns.
# NOTE(review): the warning recorded below ("the standard deviation is zero")
# goes together with the NA entries printed for Married, BankCustomer,
# PriorDefault and Approved, which hold the value 1 in every row of
# dataset_full. Presumably these constant columns are the cause; consider
# excluding them before calling cor() if NA cells are not wanted.
cor_matrix <- cor(dataset_numeric)
## Warning in cor(dataset_numeric): the standard deviation is zero
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(cor_matrix)
##                     Gender        Age       Debt Married BankCustomer
## Gender          1.00000000 -0.1842478 -0.6073564      NA           NA
## Age            -0.18424780  1.0000000  0.2470022      NA           NA
## Debt           -0.60735642  0.2470022  1.0000000      NA           NA
## Married                 NA         NA         NA       1           NA
## BankCustomer            NA         NA         NA      NA            1
## YearsEmployed   0.20990919  0.4232489  0.2237872      NA           NA
## PriorDefault            NA         NA         NA      NA           NA
## Employed        0.66666667  0.6040567 -0.2339515      NA           NA
## CreditScore     0.19011728  0.7599058  0.1869829      NA           NA
## DriversLicense  0.61237244 -0.1679561 -0.1988861      NA           NA
## ZipCode         0.01967376 -0.5887359 -0.6994434      NA           NA
## Income         -0.64609976  0.3452272 -0.1158264      NA           NA
## Approved                NA         NA         NA      NA           NA
##                YearsEmployed PriorDefault   Employed CreditScore DriversLicense
## Gender             0.2099092           NA  0.6666667  0.19011728      0.6123724
## Age                0.4232489           NA  0.6040567  0.75990576     -0.1679561
## Debt               0.2237872           NA -0.2339515  0.18698295     -0.1988861
## Married                   NA           NA         NA          NA             NA
## BankCustomer              NA           NA         NA          NA             NA
## YearsEmployed      1.0000000           NA  0.5415657  0.89711754      0.7712556
## PriorDefault              NA            1         NA          NA             NA
## Employed           0.5415657           NA  1.0000000  0.76046910      0.4082483
## CreditScore        0.8971175           NA  0.7604691  1.00000000      0.5044978
## DriversLicense     0.7712556           NA  0.4082483  0.50449784      1.0000000
## ZipCode           -0.7255806           NA -0.5016809 -0.77424657     -0.2951679
## Income            -0.1009278           NA -0.3152049  0.01028446     -0.3935026
## Approved                  NA           NA         NA          NA             NA
##                    ZipCode      Income Approved
## Gender          0.01967376 -0.64609976       NA
## Age            -0.58873594  0.34522724       NA
## Debt           -0.69944336 -0.11582643       NA
## Married                 NA          NA       NA
## BankCustomer            NA          NA       NA
## YearsEmployed  -0.72558057 -0.10092775       NA
## PriorDefault            NA          NA       NA
## Employed       -0.50168087 -0.31520488       NA
## CreditScore    -0.77424657  0.01028446       NA
## DriversLicense -0.29516787 -0.39350261       NA
## ZipCode         1.00000000  0.33473701       NA
## Income          0.33473701  1.00000000       NA
## Approved                NA          NA        1
# Reshape the correlation matrix into long format so that every cell becomes
# one row; the plot below reads the Var1, Var2 and value columns of the result.
cor_melted <- melt(cor_matrix)

# Heatmap of the correlations on a diverging red-white-blue scale centred at 0
# and fixed to the range [-1, 1].
# `limits` is written out in full: the abbreviated `limit` relied on R's
# partial argument matching to reach the scale's `limits` parameter.
ggplot(data = cor_melted, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "red", mid = "white", high = "blue",
    midpoint = 0, limits = c(-1, 1), space = "Lab",
    name = "Correlation"
  ) +
  theme_minimal() +
  # Rotate the x-axis labels so the variable names do not overlap.
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  labs(title = "Heatmap Korelasi Antar Variabel", x = "", y = "")