library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(psych) # Analisis faktor & psikometri
## Warning: package 'psych' was built under R version 4.4.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(GPArotation) #Rotasi faktor
## Warning: package 'GPArotation' was built under R version 4.4.3
##
## Attaching package: 'GPArotation'
## The following objects are masked from 'package:psych':
##
## equamax, varimin
library(factoextra) # Visualisasi multivariat
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr) # Manipulasi data
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the built-in iris dataset into the workspace and preview the first
# six rows (the default of head(), spelled out here).
data("iris")
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# The analysis is unsupervised, so the Species label column is dropped and
# only the four numeric measurements are kept.
iris_data <- iris[, names(iris) != "Species"]
# Standardise every column (centre to mean 0, scale to sd 1); these are the
# defaults of scale(), written out explicitly.
iris_scaled <- scale(iris_data, center = TRUE, scale = TRUE)
# Unrotated principal components analysis requesting the maximum number of
# components, i.e. one per variable. The count is taken from the data
# (ncol) instead of being hard-coded, so it stays correct if the set of
# input columns changes.
pca <- principal(iris_scaled, nfactors = ncol(iris_scaled), rotate = "none")
print(pca)
## Principal Components Analysis
## Call: principal(r = iris_scaled, nfactors = 4, rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PC1 PC2 PC3 PC4 h2 u2 com
## Sepal.Length 0.89 0.36 -0.28 -0.04 1 1.1e-16 1.5
## Sepal.Width -0.46 0.88 0.09 0.02 1 4.4e-16 1.5
## Petal.Length 0.99 0.02 0.05 0.12 1 -4.4e-16 1.0
## Petal.Width 0.96 0.06 0.24 -0.08 1 0.0e+00 1.1
##
## PC1 PC2 PC3 PC4
## SS loadings 2.92 0.91 0.15 0.02
## Proportion Var 0.73 0.23 0.04 0.01
## Cumulative Var 0.73 0.96 0.99 1.00
## Proportion Explained 0.73 0.23 0.04 0.01
## Cumulative Proportion 0.73 0.96 0.99 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 4 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0
## with the empirical chi square 0 with prob < NA
##
## Fit based upon off diagonal values = 1
# Scree plot: eigenvalue of each component, used to judge how many
# components are worth keeping.
eigen_values <- pca$values
plot(eigen_values, type = "b", main = "Scree Plot",
     xlab = "Komponen", ylab = "Eigenvalue",
     pch = 16, col = "blue")
# Dashed reference line at the eigenvalue = 1 cutoff.
abline(h = 1, col = "red", lty = 2, lwd = 2)
# Print each eigenvalue above its point. seq_along() replaces
# 1:length(), which would yield c(1, 0) for an empty vector.
text(seq_along(eigen_values), eigen_values,
     labels = round(eigen_values, 2), pos = 3, cex = 0.8)
legend("topright", legend = "Eigenvalue = 1",
       col = "red", lty = 2, lwd = 2)
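# Interpretasi singkat: Komponen dengan eigenvalue > 1 dianggap layak
# dipertahankan. Pada data ini hanya PC1 (eigenvalue 2,92) yang melewati
# batas tersebut; PC2 (0,91) sedikit di bawahnya. Meskipun demikian, dua
# komponen pertama bersama-sama sudah menjelaskan sekitar 96% variasi data.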
# Elbow method: total within-cluster sum of squares for k = 1..k_max.
# The upper bound is defined once so the allocation, the loop and the plot
# axis cannot drift apart.
k_max <- 10
k_values <- seq_len(k_max)
wss <- numeric(k_max)
for (k in k_values) {
  # Re-seed before every fit so each k starts from the same RNG state.
  set.seed(42)
  km <- kmeans(iris_scaled, centers = k, nstart = 25)
  wss[k] <- km$tot.withinss
}
plot(k_values, wss, type = "b", pch = 19, frame = FALSE,
     xlab = "Jumlah Cluster (k)",
     ylab = "Total Within Sum of Squares",
     main = "Metode Elbow untuk Menentukan k Optimal")
# Final model: three clusters on the standardised data, with 25 random
# starts. The seed is fixed immediately before the fit for reproducibility.
k <- 3
set.seed(42)
kmeans_result <- kmeans(x = iris_scaled, centers = k, nstart = 25)
# Display the fitted k-means object.
kmeans_result
## K-means clustering with 3 clusters of sizes 50, 53, 47
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -1.01119138 0.85041372 -1.3006301 -1.2507035
## 2 -0.05005221 -0.88042696 0.3465767 0.2805873
## 3 1.13217737 0.08812645 0.9928284 1.0141287
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 2 2 3 2 2 2 2 2 2 2 2 3 2 2 2 2 3 2 2 2
## [75] 2 3 3 3 2 2 2 2 2 2 2 3 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3
## [112] 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 3 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3 3 2 3
## [149] 3 2
##
## Within cluster sum of squares by cluster:
## [1] 47.35062 44.08754 47.45019
## (between_SS / total_SS = 76.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Attach the cluster assignment, as a factor, to a copy of the original data.
iris_cluster <- iris
iris_cluster$cluster <- as.factor(kmeans_result$cluster)
# Cross-tabulate cluster membership (rows) against the true species (columns).
table(iris_cluster[["cluster"]], iris_cluster[["Species"]])
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 39 14
## 3 0 11 36
# Scatter plot of the two petal measurements: colour encodes the k-means
# cluster, point shape encodes the true species.
ggplot(
  data = iris_cluster,
  mapping = aes(
    x = Petal.Length,
    y = Petal.Width,
    color = cluster,
    shape = Species
  )
) +
  theme_minimal() +
  geom_point(size = 3) +
  labs(
    title = "Hasil K-Means Clustering pada Data Iris",
    x = "Petal Length",
    y = "Petal Width"
  )
# Additional cluster view via factoextra. stand = FALSE because the data
# passed in (iris_scaled) has already been standardised.
fviz_cluster(
  object = kmeans_result,
  data = iris_scaled,
  stand = FALSE,
  geom = "point",
  show.clust.cent = TRUE,
  ellipse = TRUE
) +
  labs(title = "Visualisasi Cluster dengan Factoextra")
# Interpretasi
# - Berdasarkan tabel perbandingan, cluster 1 memuat seluruh 50 sampel
#   Iris setosa.
# - Cluster 2 dan 3 mencampur versicolor dan virginica (karena keduanya
#   mirip secara morfologi).
# - Dua variabel petal (panjang & lebar mahkota bunga) paling berperan
#   dalam pembentukan cluster.
# - Hasil ini sesuai dengan karakteristik asli spesies iris.