## Memuat Library untuk Visualisasi Correlation Matrix
library(ggplot2)
library(reshape2)
Pada tahap ini, dilakukan penginputan dataset yang berisi fitur numerik dan non-numerik. Tujuannya adalah agar dataset tersebut dapat dianalisis pada tahap-tahap berikutnya.
# Hand-entered sample: five rows and fifteen columns. Industry and Citizen are
# character columns; every other column is numeric.
dataset_full <- data.frame(
  Gender         = c(1, 0, 0, 1, 0),
  Age            = c(30.83, 58.67, 24.50, 27.83, 20.17),
  Debt           = c(0.000, 4.460, 0.500, 1.540, 5.625),
  Married        = rep(1, 5),  # same value in every row of this sample
  BankCustomer   = rep(1, 5),  # same value in every row of this sample
  Industry       = c(
    "Industrials", "Materials", "Materials", "Industrials", "Industrials"
  ),
  YearsEmployed  = c(1.25, 3.04, 1.50, 3.75, 1.71),
  PriorDefault   = rep(1, 5),  # same value in every row of this sample
  Employed       = c(1, 1, 0, 1, 0),
  CreditScore    = c(1, 6, 0, 5, 0),
  DriversLicense = c(0, 0, 0, 1, 0),
  Citizen        = c(
    "ByBirth", "ByBirth", "ByBirth", "ByBirth", "ByOtherMeans"
  ),
  ZipCode        = c(202, 43, 280, 100, 120),
  Income         = c(0, 560, 824, 3, 0),
  Approved       = rep(1, 5)   # same value in every row of this sample
)

# Show the full data frame so the input can be checked by eye.
print("Dataset:")
## [1] "Dataset:"
print(dataset_full)
## Gender Age Debt Married BankCustomer Industry YearsEmployed
## 1 1 30.83 0.000 1 1 Industrials 1.25
## 2 0 58.67 4.460 1 1 Materials 3.04
## 3 0 24.50 0.500 1 1 Materials 1.50
## 4 1 27.83 1.540 1 1 Industrials 3.75
## 5 0 20.17 5.625 1 1 Industrials 1.71
## PriorDefault Employed CreditScore DriversLicense Citizen ZipCode Income
## 1 1 1 1 0 ByBirth 202 0
## 2 1 1 6 0 ByBirth 43 560
## 3 1 0 0 0 ByBirth 280 824
## 4 1 1 5 1 ByBirth 100 3
## 5 1 0 0 0 ByOtherMeans 120 0
## Approved
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
Setelah dataset diinput, fitur-fitur yang bertipe numerik perlu diseleksi, karena hanya fitur numerik yang dapat digunakan dalam perhitungan matematis seperti eigen values, covariance matrix, dan correlation matrix.
# Keep only the numeric columns of dataset_full; the character columns cannot
# be passed to cov(), cor() or eigen() further down.
# vapply() with a logical(1) template always returns a logical vector, even for
# a data frame with zero columns, whereas sapply() would return an empty list
# in that case and break the column subsetting.
dataset_numeric <- dataset_full[vapply(dataset_full, is.numeric, logical(1))]
print("Data setelah hanya berisi fitur numerik:")
## [1] "Data setelah hanya berisi fitur numerik:"
print(dataset_numeric)
## Gender Age Debt Married BankCustomer YearsEmployed PriorDefault Employed
## 1 1 30.83 0.000 1 1 1.25 1 1
## 2 0 58.67 4.460 1 1 3.04 1 1
## 3 0 24.50 0.500 1 1 1.50 1 0
## 4 1 27.83 1.540 1 1 3.75 1 1
## 5 0 20.17 5.625 1 1 1.71 1 0
## CreditScore DriversLicense ZipCode Income Approved
## 1 1 0 202 0 1
## 2 6 0 43 560 1
## 3 0 0 280 824 1
## 4 5 1 100 3 1
## 5 0 0 120 0 1
Eigen values dan eigen vectors digunakan dalam analisis dimensi dan reduksi data, terutama dalam PCA (Principal Component Analysis). Eigen values menggambarkan besarnya informasi yang dapat dijelaskan oleh tiap dimensi dalam dataset, sedangkan eigen vectors menunjukkan arah dari dimensi utama dalam dataset. Sebelum menghitung eigen values dan eigen vectors, variance-covariance matrix perlu dihitung terlebih dahulu. Variance-covariance matrix digunakan untuk melihat seberapa besar hubungan antarvariabel dalam dataset.
# Variance-covariance matrix of the numeric features: variances sit on the
# diagonal and pairwise covariances off the diagonal. A column that never
# varies contributes an all-zero row and column.
cov_matrix <- stats::cov(x = dataset_numeric)
print("Variance-Covariance Matrix:")
## [1] "Variance-Covariance Matrix:"
print(cov_matrix)
## Gender Age Debt Married BankCustomer
## Gender 0.3000 -1.535000 -0.827500 0 0
## Age -1.5350 231.361400 9.345663 0 0
## Debt -0.8275 9.345663 6.187675 0 0
## Married 0.0000 0.000000 0.000000 0 0
## BankCustomer 0.0000 0.000000 0.000000 0 0
## YearsEmployed 0.1250 6.999375 0.605225 0 0
## PriorDefault 0.0000 0.000000 0.000000 0 0
## Employed 0.2000 5.032500 -0.318750 0 0
## CreditScore 0.3000 33.300000 1.340000 0 0
## DriversLicense 0.1500 -1.142500 -0.221250 0 0
## ZipCode 1.0000 -831.032500 -161.461250 0 0
## Income -137.9500 2046.972500 -112.313750 0 0
## Approved 0.0000 0.000000 0.000000 0 0
## YearsEmployed PriorDefault Employed CreditScore DriversLicense
## Gender 0.125000 0 0.20000 0.30 0.15000
## Age 6.999375 0 5.03250 33.30 -1.14250
## Debt 0.605225 0 -0.31875 1.34 -0.22125
## Married 0.000000 0 0.00000 0.00 0.00000
## BankCustomer 0.000000 0 0.00000 0.00 0.00000
## YearsEmployed 1.182050 0 0.32250 2.81 0.37500
## PriorDefault 0.000000 0 0.00000 0.00 0.00000
## Employed 0.322500 0 0.30000 1.20 0.10000
## CreditScore 2.810000 0 1.20000 8.30 0.65000
## DriversLicense 0.375000 0 0.10000 0.65 0.20000
## ZipCode -73.207500 0 -25.50000 -207.00 -12.25000
## Income -42.775000 0 -67.30000 11.55 -68.60000
## Approved 0.000000 0 0.00000 0.00 0.00000
## ZipCode Income Approved
## Gender 1.0000 -137.9500 0
## Age -831.0325 2046.9725 0
## Debt -161.4613 -112.3137 0
## Married 0.0000 0.0000 0
## BankCustomer 0.0000 0.0000 0
## YearsEmployed -73.2075 -42.7750 0
## PriorDefault 0.0000 0.0000 0
## Employed -25.5000 -67.3000 0
## CreditScore -207.0000 11.5500 0
## DriversLicense -12.2500 -68.6000 0
## ZipCode 8612.0000 12109.2500 0
## Income 12109.2500 151957.8000 0
## Approved 0.0000 0.0000 0
# Eigen decomposition of the variance-covariance matrix. The result is a list;
# this script reads its `values` and `vectors` components.
eigen_calculate <- eigen(x = cov_matrix)
print("Eigen Values:")
## [1] "Eigen Values:"
# Eigenvalues that would be exactly zero in exact arithmetic can print as tiny
# positive or negative numbers because of floating-point rounding.
print(eigen_calculate[["values"]])
## [1] 1.529993e+05 7.738337e+03 7.516727e+01 4.867587e+00 0.000000e+00
## [6] -1.733029e-29 -4.791007e-16 -5.033986e-16 -7.406715e-16 -1.918058e-14
## [11] -2.260795e-14 -2.521282e-13 -1.110098e-12
print("Eigen Vectors:")
## [1] "Eigen Vectors:"
# Each column of the printed matrix is one eigenvector; column k belongs to the
# k-th eigenvalue, and the rows follow the column order of cov_matrix.
print(eigen_calculate[["vectors"]])
## [,1] [,2] [,3] [,4] [,5]
## [1,] 8.979963e-04 1.599479e-03 2.669568e-02 1.456467e-01 0
## [2,] -1.289705e-02 -1.315601e-01 9.779281e-01 -1.368644e-01 0
## [3,] 8.188070e-04 -1.960531e-02 -1.280374e-01 -6.212193e-01 0
## [4,] 0.000000e+00 -2.775558e-17 -9.992007e-16 2.886580e-15 0
## [5,] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0
## [6,] 3.179461e-04 -9.024786e-03 -1.829859e-02 3.240546e-01 0
## [7,] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0
## [8,] 4.517914e-04 -2.638175e-03 4.507633e-02 1.130247e-01 0
## [9,] 3.495695e-05 -2.713553e-02 8.404525e-02 6.522530e-01 0
## [10,] 4.535475e-04 -8.272715e-04 -1.323359e-02 1.755846e-01 0
## [11,] -8.349356e-02 9.873753e-01 1.280421e-01 -9.271224e-03 0
## [12,] -9.964239e-01 -8.105261e-02 -2.345634e-02 2.426576e-03 0
## [13,] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 1
## [,6] [,7] [,8] [,9] [,10]
## [1,] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [2,] 3.173865e-17 2.211582e-04 5.827793e-05 1.467290e-03 1.019239e-02
## [3,] 1.525036e-15 1.845961e-03 5.438426e-03 5.448677e-03 -6.073995e-03
## [4,] 2.172232e-14 1.263052e-02 8.046240e-02 -6.975346e-02 -9.826273e-01
## [5,] -1.310497e-14 -9.786714e-01 1.359410e-01 1.535353e-01 -1.213435e-02
## [6,] 1.504406e-13 5.511389e-02 5.737888e-01 -1.505700e-01 1.587244e-01
## [7,] 1.000000e+00 -4.964085e-14 -2.584517e-13 -3.080869e-15 5.273559e-16
## [8,] 1.685824e-13 1.750799e-01 6.117721e-01 5.735624e-01 -6.427223e-02
## [9,] -7.274479e-14 -7.130678e-02 -2.651466e-01 -2.224637e-01 -6.416719e-02
## [10,] -1.105307e-13 5.712574e-02 -4.485693e-01 7.552278e-01 -2.670500e-02
## [11,] -2.243282e-16 -8.584723e-04 -6.428388e-04 -4.943334e-03 7.150675e-04
## [12,] 9.121773e-17 1.910588e-04 3.045746e-04 9.476757e-04 -1.897342e-04
## [13,] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [,11] [,12] [,13]
## [1,] 9.889747e-01 0.000000000 0.000000e+00
## [2,] -6.016930e-03 -0.074752973 -4.147413e-02
## [3,] 9.497431e-02 -0.088977385 -7.617554e-01
## [4,] -3.885781e-16 -0.149331453 2.538405e-02
## [5,] 0.000000e+00 -0.001407085 -4.176827e-05
## [6,] -4.721539e-02 -0.691581337 -1.841802e-01
## [7,] 0.000000e+00 0.000000000 0.000000e+00
## [8,] -1.785809e-02 0.474309528 -1.479025e-01
## [9,] -9.828235e-02 0.276441596 -5.930360e-01
## [10,] -2.550026e-02 -0.429450749 -9.141105e-02
## [11,] -3.611980e-03 -0.009420849 -3.886030e-02
## [12,] 1.311648e-03 0.001492445 2.978830e-03
## [13,] 0.000000e+00 0.000000000 0.000000e+00
Correlation matrix digunakan untuk melihat seberapa kuat/erat hubungan antarvariabel dalam skala -1 hingga 1. Apabila nilainya: 1. mendekati 1 = variabel berkorelasi positif sangat kuat; 2. mendekati -1 = variabel berkorelasi negatif sangat kuat; 3. mendekati 0 = tidak ada hubungan yang kuat antara variabel.
# Pairwise correlation matrix of the numeric features.
# Married, BankCustomer, PriorDefault and Approved hold the same value in every
# row of this sample, so their standard deviation is zero. cor() therefore
# emits the warning shown below and returns NA for their correlation with every
# other column (only their own diagonal entry is 1).
# NOTE(review): dropping the constant columns before calling cor() would avoid
# both the warning and the NA entries; confirm whether they must stay in the
# matrix.
cor_matrix <- cor(dataset_numeric)
## Warning in cor(dataset_numeric): the standard deviation is zero
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(cor_matrix)
## Gender Age Debt Married BankCustomer
## Gender 1.00000000 -0.1842478 -0.6073564 NA NA
## Age -0.18424780 1.0000000 0.2470022 NA NA
## Debt -0.60735642 0.2470022 1.0000000 NA NA
## Married NA NA NA 1 NA
## BankCustomer NA NA NA NA 1
## YearsEmployed 0.20990919 0.4232489 0.2237872 NA NA
## PriorDefault NA NA NA NA NA
## Employed 0.66666667 0.6040567 -0.2339515 NA NA
## CreditScore 0.19011728 0.7599058 0.1869829 NA NA
## DriversLicense 0.61237244 -0.1679561 -0.1988861 NA NA
## ZipCode 0.01967376 -0.5887359 -0.6994434 NA NA
## Income -0.64609976 0.3452272 -0.1158264 NA NA
## Approved NA NA NA NA NA
## YearsEmployed PriorDefault Employed CreditScore DriversLicense
## Gender 0.2099092 NA 0.6666667 0.19011728 0.6123724
## Age 0.4232489 NA 0.6040567 0.75990576 -0.1679561
## Debt 0.2237872 NA -0.2339515 0.18698295 -0.1988861
## Married NA NA NA NA NA
## BankCustomer NA NA NA NA NA
## YearsEmployed 1.0000000 NA 0.5415657 0.89711754 0.7712556
## PriorDefault NA 1 NA NA NA
## Employed 0.5415657 NA 1.0000000 0.76046910 0.4082483
## CreditScore 0.8971175 NA 0.7604691 1.00000000 0.5044978
## DriversLicense 0.7712556 NA 0.4082483 0.50449784 1.0000000
## ZipCode -0.7255806 NA -0.5016809 -0.77424657 -0.2951679
## Income -0.1009278 NA -0.3152049 0.01028446 -0.3935026
## Approved NA NA NA NA NA
## ZipCode Income Approved
## Gender 0.01967376 -0.64609976 NA
## Age -0.58873594 0.34522724 NA
## Debt -0.69944336 -0.11582643 NA
## Married NA NA NA
## BankCustomer NA NA NA
## YearsEmployed -0.72558057 -0.10092775 NA
## PriorDefault NA NA NA
## Employed -0.50168087 -0.31520488 NA
## CreditScore -0.77424657 0.01028446 NA
## DriversLicense -0.29516787 -0.39350261 NA
## ZipCode 1.00000000 0.33473701 NA
## Income 0.33473701 1.00000000 NA
## Approved NA NA 1
# Reshape the correlation matrix into long format (columns Var1, Var2 and
# value, one row per variable pair) so ggplot2 can draw one tile per pair.
cor_melted <- melt(cor_matrix)

# Heatmap of the pairwise correlations: red for negative, white for zero and
# blue for positive values, on a fixed -1..1 colour scale.
# Pairs whose correlation is NA have no position on the gradient.
ggplot(data = cor_melted, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "red", mid = "white", high = "blue",
    midpoint = 0,
    # Spelled out as `limits`: the previous `limit` only worked through R's
    # partial argument matching.
    limits = c(-1, 1),
    space = "Lab",
    name = "Correlation"
  ) +
  theme_minimal() +
  # Rotate the x-axis labels so the variable names do not overlap.
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  labs(title = "Heatmap Korelasi Antar Variabel", x = "", y = "")