data <- read_excel("Dry_Bean_Dataset.xlsx")
head(data)
## # A tibble: 6 × 17
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation Eccentricity
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 28395 610. 208. 174. 1.20 0.550
## 2 28734 638. 201. 183. 1.10 0.412
## 3 29380 624. 213. 176. 1.21 0.563
## 4 30008 646. 211. 183. 1.15 0.499
## 5 30140 620. 202. 190. 1.06 0.334
## 6 30279 635. 213. 182. 1.17 0.520
## # ℹ 11 more variables: ConvexArea <dbl>, EquivDiameter <dbl>, Extent <dbl>,
## # Solidity <dbl>, roundness <dbl>, Compactness <dbl>, ShapeFactor1 <dbl>,
## # ShapeFactor2 <dbl>, ShapeFactor3 <dbl>, ShapeFactor4 <dbl>, Class <chr>
dim(data)
## [1] 13611 17
str(data)
## tibble [13,611 × 17] (S3: tbl_df/tbl/data.frame)
## $ Area : num [1:13611] 28395 28734 29380 30008 30140 ...
## $ Perimeter : num [1:13611] 610 638 624 646 620 ...
## $ MajorAxisLength: num [1:13611] 208 201 213 211 202 ...
## $ MinorAxisLength: num [1:13611] 174 183 176 183 190 ...
## $ AspectRation : num [1:13611] 1.2 1.1 1.21 1.15 1.06 ...
## $ Eccentricity : num [1:13611] 0.55 0.412 0.563 0.499 0.334 ...
## $ ConvexArea : num [1:13611] 28715 29172 29690 30724 30417 ...
## $ EquivDiameter : num [1:13611] 190 191 193 195 196 ...
## $ Extent : num [1:13611] 0.764 0.784 0.778 0.783 0.773 ...
## $ Solidity : num [1:13611] 0.989 0.985 0.99 0.977 0.991 ...
## $ roundness : num [1:13611] 0.958 0.887 0.948 0.904 0.985 ...
## $ Compactness : num [1:13611] 0.913 0.954 0.909 0.928 0.971 ...
## $ ShapeFactor1 : num [1:13611] 0.00733 0.00698 0.00724 0.00702 0.0067 ...
## $ ShapeFactor2 : num [1:13611] 0.00315 0.00356 0.00305 0.00321 0.00366 ...
## $ ShapeFactor3 : num [1:13611] 0.834 0.91 0.826 0.862 0.942 ...
## $ ShapeFactor4 : num [1:13611] 0.999 0.998 0.999 0.994 0.999 ...
## $ Class : chr [1:13611] "SEKER" "SEKER" "SEKER" "SEKER" ...
data_num <- data[, sapply(data, is.numeric)]
str(data_num)
## tibble [13,611 × 16] (S3: tbl_df/tbl/data.frame)
## $ Area : num [1:13611] 28395 28734 29380 30008 30140 ...
## $ Perimeter : num [1:13611] 610 638 624 646 620 ...
## $ MajorAxisLength: num [1:13611] 208 201 213 211 202 ...
## $ MinorAxisLength: num [1:13611] 174 183 176 183 190 ...
## $ AspectRation : num [1:13611] 1.2 1.1 1.21 1.15 1.06 ...
## $ Eccentricity : num [1:13611] 0.55 0.412 0.563 0.499 0.334 ...
## $ ConvexArea : num [1:13611] 28715 29172 29690 30724 30417 ...
## $ EquivDiameter : num [1:13611] 190 191 193 195 196 ...
## $ Extent : num [1:13611] 0.764 0.784 0.778 0.783 0.773 ...
## $ Solidity : num [1:13611] 0.989 0.985 0.99 0.977 0.991 ...
## $ roundness : num [1:13611] 0.958 0.887 0.948 0.904 0.985 ...
## $ Compactness : num [1:13611] 0.913 0.954 0.909 0.928 0.971 ...
## $ ShapeFactor1 : num [1:13611] 0.00733 0.00698 0.00724 0.00702 0.0067 ...
## $ ShapeFactor2 : num [1:13611] 0.00315 0.00356 0.00305 0.00321 0.00366 ...
## $ ShapeFactor3 : num [1:13611] 0.834 0.91 0.826 0.862 0.942 ...
## $ ShapeFactor4 : num [1:13611] 0.999 0.998 0.999 0.994 0.999 ...
colSums(is.na(data))
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## 0 0 0 0 0
## Eccentricity ConvexArea EquivDiameter Extent Solidity
## 0 0 0 0 0
## roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3
## 0 0 0 0 0
## ShapeFactor4 Class
## 0 0
Berdasarkan hasil pemeriksaan missing value, tidak ditemukan nilai yang hilang pada setiap variabel. Hal ini menunjukkan bahwa dataset telah bersih dan siap untuk dianalisis lebih lanjut tanpa perlu proses imputasi data.
desc <- describe(data_num)
kable(desc[,c("mean","sd","min","max")],
caption = "Tabel 1. Statistik Deskriptif Variabel")
| mean | sd | min | max | |
|---|---|---|---|---|
| Area | 5.304828e+04 | 2.932410e+04 | 2.042000e+04 | 2.546160e+05 |
| Perimeter | 8.552835e+02 | 2.142897e+02 | 5.247360e+02 | 1.985370e+03 |
| MajorAxisLength | 3.201419e+02 | 8.569419e+01 | 1.836012e+02 | 7.388602e+02 |
| MinorAxisLength | 2.022707e+02 | 4.497009e+01 | 1.225127e+02 | 4.601985e+02 |
| AspectRation | 1.583242e+00 | 2.466785e-01 | 1.024868e+00 | 2.430306e+00 |
| Eccentricity | 7.508949e-01 | 9.200180e-02 | 2.189513e-01 | 9.114230e-01 |
| ConvexArea | 5.376820e+04 | 2.977492e+04 | 2.068400e+04 | 2.632610e+05 |
| EquivDiameter | 2.530642e+02 | 5.917712e+01 | 1.612438e+02 | 5.693744e+02 |
| Extent | 7.497328e-01 | 4.908640e-02 | 5.553147e-01 | 8.661946e-01 |
| Solidity | 9.871428e-01 | 4.660400e-03 | 9.192462e-01 | 9.946775e-01 |
| roundness | 8.732818e-01 | 5.951990e-02 | 4.896183e-01 | 9.906854e-01 |
| Compactness | 7.998637e-01 | 6.171350e-02 | 6.405768e-01 | 9.873030e-01 |
| ShapeFactor1 | 6.563600e-03 | 1.128000e-03 | 2.778000e-03 | 1.045120e-02 |
| ShapeFactor2 | 1.715900e-03 | 5.959000e-04 | 5.642000e-04 | 3.665000e-03 |
| ShapeFactor3 | 6.435902e-01 | 9.899620e-02 | 4.103386e-01 | 9.747672e-01 |
| ShapeFactor4 | 9.950633e-01 | 4.366500e-03 | 9.476874e-01 | 9.997325e-01 |
Statistik deskriptif menunjukkan bahwa setiap variabel memiliki rentang nilai dan standar deviasi yang berbeda. Perbedaan skala antar variabel ini menjadi alasan dilakukannya standardisasi sebelum analisis PCA agar setiap variabel memiliki kontribusi yang seimbang.
cor_matrix <- cor(data_num)
biru_pink <- colorRampPalette(c("#3A4F7A", "#FFFFFF", "#E8A0BF"))(200)
corrplot(cor_matrix,
method = "color",
type = "upper",
col = biru_pink,
tl.cex = 0.6)
## Warning in ind1:ind2: numerical expression has 2 elements: only the first used
Matriks korelasi menunjukkan adanya hubungan yang cukup kuat antar beberapa variabel. Hal ini mengindikasikan adanya kemungkinan redundansi informasi, sehingga metode reduksi dimensi seperti PCA dan Factor Analysis relevan untuk diterapkan.
data_scaled <- scale(data_num)
Standardisasi dilakukan untuk menyamakan skala variabel agar setiap variabel memiliki kontribusi yang seimbang dalam analisis PCA dan Factor Analysis. Setelah distandardisasi, setiap variabel memiliki rata-rata 0 dan standar deviasi 1.
# 1.KMO Test
KMO(data_scaled)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = data_scaled)
## Overall MSA = 0.67
## MSA for each item =
## Area Perimeter MajorAxisLength MinorAxisLength AspectRation
## 0.66 0.82 0.68 0.58 0.60
## Eccentricity ConvexArea EquivDiameter Extent Solidity
## 0.70 0.67 0.63 1.00 0.43
## roundness Compactness ShapeFactor1 ShapeFactor2 ShapeFactor3
## 0.74 0.66 0.78 0.85 0.68
## ShapeFactor4
## 0.40
Nilai KMO yang diperoleh lebih dari 0,5 menunjukkan bahwa data memenuhi syarat untuk dilakukan analisis faktor. Artinya, pola korelasi antar variabel cukup memadai untuk membentuk faktor.
# 2. Bartlett Test
cortest.bartlett(data_scaled)
## R was not square, finding R from data
## $chisq
## [1] 1046464
##
## $p.value
## [1] 0
##
## $df
## [1] 120
Hasil Bartlett’s Test menunjukkan nilai signifikansi kurang dari 0,05, sehingga dapat disimpulkan bahwa matriks korelasi tidak berbentuk matriks identitas. Dengan demikian, variabel saling berkorelasi dan layak dianalisis menggunakan analisis faktor.
pca_result <- prcomp(data_scaled, center = TRUE, scale. = TRUE)
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.9790 2.0564 1.13183 0.90457 0.66203 0.4289 0.33410
## Proportion of Variance 0.5547 0.2643 0.08007 0.05114 0.02739 0.0115 0.00698
## Cumulative Proportion 0.5547 0.8190 0.89904 0.95018 0.97757 0.9891 0.99605
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.22806 0.09089 0.03813 0.03247 0.01715 0.01220 0.003164
## Proportion of Variance 0.00325 0.00052 0.00009 0.00007 0.00002 0.00001 0.000000
## Cumulative Proportion 0.99930 0.99981 0.99991 0.99997 0.99999 1.00000 1.000000
## PC15 PC16
## Standard deviation 0.001465 0.001336
## Proportion of Variance 0.000000 0.000000
## Cumulative Proportion 1.000000 1.000000
Hasil PCA menunjukkan bahwa beberapa komponen utama mampu menjelaskan sebagian besar variasi data. Komponen pertama memiliki kontribusi variasi terbesar dibandingkan komponen lainnya.
fviz_eig(pca_result, barfill = "#3A4F7A", barcolor = "#3A4F7A")
## Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
## Ignoring empty aesthetic: `width`.
Scree plot menunjukkan adanya penurunan eigenvalue yang signifikan pada beberapa komponen awal. Berdasarkan grafik tersebut, tiga komponen utama dipilih karena mampu merepresentasikan sebagian besar variasi data.
kable(
round(pca_result$rotation[,1:3],3),
caption = "Tabel 2. Loading Tiga Komponen Utama"
)
| PC1 | PC2 | PC3 | |
|---|---|---|---|
| Area | 0.282 | 0.246 | -0.061 |
| Perimeter | 0.311 | 0.179 | -0.019 |
| MajorAxisLength | 0.326 | 0.101 | -0.085 |
| MinorAxisLength | 0.236 | 0.343 | 0.008 |
| AspectRation | 0.229 | -0.331 | -0.169 |
| Eccentricity | 0.232 | -0.319 | -0.163 |
| ConvexArea | 0.283 | 0.245 | -0.054 |
| EquivDiameter | 0.297 | 0.223 | -0.050 |
| Extent | -0.060 | 0.221 | -0.085 |
| Solidity | -0.143 | 0.103 | -0.739 |
| roundness | -0.248 | 0.215 | -0.163 |
| Compactness | -0.238 | 0.329 | 0.150 |
| ShapeFactor1 | -0.221 | -0.333 | -0.033 |
| ShapeFactor2 | -0.315 | 0.129 | 0.120 |
| ShapeFactor3 | -0.239 | 0.328 | 0.150 |
| ShapeFactor4 | -0.198 | 0.100 | -0.537 |
Nilai loading menunjukkan tingkat kontribusi masing-masing variabel terhadap komponen utama. Variabel dengan nilai loading yang tinggi memiliki peran dominan dalam membentuk komponen tersebut.
fviz_pca_var(pca_result,
col.var = "contrib",
gradient.cols = c("#3A4F7A", "#E8A0BF"),
repel = TRUE)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Plot variabel PCA menunjukkan besarnya kontribusi masing-masing variabel terhadap komponen utama. Panjang panah merepresentasikan kekuatan kontribusi, sedangkan arah panah menunjukkan hubungan antar variabel.
fviz_pca_biplot(pca_result,
col.var = "#3A4F7A",
col.ind = "#E8A0BF",
repel = TRUE,
label = "var")
Biplot PCA menggambarkan hubungan antara observasi dan variabel dalam dua dimensi utama. Panjang dan arah panah menunjukkan kekuatan serta arah kontribusi variabel terhadap komponen utama, sedangkan kedekatan antar titik observasi menunjukkan kemiripan karakteristik.
fa.parallel(data_scaled, fa = "fa", fm = "pa")
## Parallel analysis suggests that the number of factors = 3 and the number of components = NA
Hasil parallel analysis menunjukkan bahwa jumlah faktor optimal yang dapat dipertahankan adalah tiga faktor, sesuai dengan kriteria eigenvalue dan perbandingan terhadap data acak.
fa_result <- fa(data_scaled, nfactors=3, rotate="varimax", fm = "pa")
## maximum iteration exceeded
## Warning in log(det(m.inv.r)): NaNs produced
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
print(fa_result)
## Factor Analysis using method = pa
## Call: fa(r = data_scaled, nfactors = 3, rotate = "varimax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA2 PA3 h2 u2 com
## Area 0.97 -0.14 -0.06 0.96 0.0372 1.0
## Perimeter 0.95 -0.29 -0.13 1.00 0.0011 1.2
## MajorAxisLength 0.88 -0.46 -0.09 1.00 0.0024 1.5
## MinorAxisLength 0.99 0.11 -0.07 1.00 -0.0044 1.0
## AspectRation 0.10 -0.97 -0.05 0.96 0.0359 1.0
## Eccentricity 0.12 -0.95 -0.07 0.92 0.0752 1.0
## ConvexArea 0.97 -0.14 -0.07 0.96 0.0368 1.1
## EquivDiameter 0.98 -0.20 -0.08 1.00 -0.0049 1.1
## Extent 0.11 0.37 0.11 0.16 0.8376 1.4
## Solidity -0.09 0.20 1.23 1.56 -0.5567 1.1
## roundness -0.27 0.72 0.35 0.71 0.2889 1.8
## Compactness -0.12 0.99 0.08 1.01 -0.0059 1.0
## ShapeFactor1 -0.92 -0.12 0.08 0.87 0.1296 1.0
## ShapeFactor2 -0.56 0.80 0.11 0.97 0.0348 1.8
## ShapeFactor3 -0.13 0.99 0.08 1.00 -0.0034 1.0
## ShapeFactor4 -0.28 0.40 0.47 0.46 0.5389 2.6
##
## PA1 PA2 PA3
## SS loadings 6.89 5.72 1.95
## Proportion Var 0.43 0.36 0.12
## Cumulative Var 0.43 0.79 0.91
## Proportion Explained 0.47 0.39 0.13
## Cumulative Proportion 0.47 0.87 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 3 factors are sufficient.
##
## df null model = 120 with the objective function = 76.92 with Chi Square = 1046464
## df of the model are 75 and the objective function was NaN
##
## The root mean square of the residuals (RMSR) is 0.01
## The df corrected root mean square of the residuals is 0.02
##
## The harmonic n.obs is 13611 with the empirical chi square 362.94 with prob < 2.4e-39
## The total n.obs was 13611 with Likelihood Chi Square = NaN with prob < NaN
##
## Tucker Lewis Index of factoring reliability = NaN
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## PA1 PA2 PA3
## Correlation of (regression) scores with factors 1 1 1
## Multiple R square of scores with factors 1 1 1
## Minimum correlation of possible factor scores 1 1 1
Factor Analysis menghasilkan tiga faktor utama setelah dilakukan rotasi varimax. Nilai loading menunjukkan kekuatan hubungan antara variabel dan faktor. Variabel dengan loading tinggi pada faktor yang sama dapat dikelompokkan dalam satu dimensi yang merepresentasikan karakteristik tertentu.
load <- as.data.frame(unclass(fa_result$loadings))
load_round <- round(load, 3)
load_cut <- load_round
load_cut[abs(load_cut) < 0.5] <- ""
kable(load_cut,
caption = "Tabel 3. Factor Loading (>0,5) Setelah Rotasi Varimax")
| PA1 | PA2 | PA3 | |
|---|---|---|---|
| Area | 0.969 | ||
| Perimeter | 0.948 | ||
| MajorAxisLength | 0.884 | ||
| MinorAxisLength | 0.993 | ||
| AspectRation | -0.975 | ||
| Eccentricity | -0.951 | ||
| ConvexArea | 0.969 | ||
| EquivDiameter | 0.979 | ||
| Extent | |||
| Solidity | 1.227 | ||
| roundness | 0.72 | ||
| Compactness | 0.992 | ||
| ShapeFactor1 | -0.922 | ||
| ShapeFactor2 | -0.556 | 0.803 | |
| ShapeFactor3 | 0.991 | ||
| ShapeFactor4 |
Rotasi varimax menghasilkan tiga faktor utama dengan nilai loading dominan di atas 0,5. Hal ini menunjukkan bahwa variabel dapat dikelompokkan ke dalam dimensi tertentu sehingga struktur data menjadi lebih sederhana dan mudah dipahami.