library(readr)
library(psych)
library(GPArotation)
##
## Attaching package: 'GPArotation'
## The following objects are masked from 'package:psych':
##
## equamax, varimin
library(corrplot)
## corrplot 0.95 loaded
library(factoextra)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Memanggil package yang dibutuhkan untuk analisis PCA dan FA (package harus sudah terinstal sebelumnya).
# Load the Better Life Index CSV from the working directory into a tibble.
df <- read_csv(file = "better-life-index-2024 (2).csv")
## Rows: 38 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Country
## dbl (25): GDP per capita (USD), Dwellings without basic facilities, Housing ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Strip leading/trailing whitespace from the column headers so the names
# below match exactly.
names(df) <- trimws(names(df))

# The twelve indicators kept for the PCA / factor analysis.
cols <- c(
  "GDP per capita (USD)", "Employment rate", "Quality of support network",
  "Student skills", "Air pollution", "Water quality",
  "Voter turnout", "Life expectancy", "Self-reported health",
  "Feeling safe walking alone at night", "Homicide rate", "Life satisfaction"
)

# Keep only the selected indicators and preview the first six rows.
df_sel <- df[cols]
head(df_sel, n = 6)
## # A tibble: 6 × 12
## `GDP per capita (USD)` `Employment rate` `Quality of support network`
## <dbl> <dbl> <dbl>
## 1 66589 73 93
## 2 59225 72 92
## 3 55536 65 90
## 4 54866 70 93
## 5 16616 56 88
## 6 7327 58 80
## # ℹ 9 more variables: `Student skills` <dbl>, `Air pollution` <dbl>,
## # `Water quality` <dbl>, `Voter turnout` <dbl>, `Life expectancy` <dbl>,
## # `Self-reported health` <dbl>, `Feeling safe walking alone at night` <dbl>,
## # `Homicide rate` <dbl>, `Life satisfaction` <dbl>
Variabel yang dipilih mewakili aspek ekonomi, kesehatan, keamanan, lingkungan, dan kepuasan hidup.
# Count missing values (NA) in each selected column.
colSums(is.na(df_sel))
## GDP per capita (USD) Employment rate
## 0 0
## Quality of support network Student skills
## 0 1
## Air pollution Water quality
## 0 0
## Voter turnout Life expectancy
## 0 0
## Self-reported health Feeling safe walking alone at night
## 0 0
## Homicide rate Life satisfaction
## 0 0
# Drop every row that contains at least one NA, then recount the NAs per
# column to confirm that none remain.
df_sel <- na.omit(df_sel)
colSums(is.na(df_sel))
## GDP per capita (USD) Employment rate
## 0 0
## Quality of support network Student skills
## 0 0
## Air pollution Water quality
## 0 0
## Voter turnout Life expectancy
## 0 0
## Self-reported health Feeling safe walking alone at night
## 0 0
## Homicide rate Life satisfaction
## 0 0
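Terdapat 1 nilai kosong pada variabel Student skills; baris tersebut dihapus dengan na.omit() untuk memastikan tidak ada data kosong sebelum analisis.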
# Repeat of the earlier NA count; df_sel has already been filtered with
# na.omit(), so every count is expected to be 0.
colSums(is.na(df_sel))
## GDP per capita (USD) Employment rate
## 0 0
## Quality of support network Student skills
## 0 0
## Air pollution Water quality
## 0 0
## Voter turnout Life expectancy
## 0 0
## Self-reported health Feeling safe walking alone at night
## 0 0
## Homicide rate Life satisfaction
## 0 0
# Repeat of the earlier na.omit() step; with no NAs left in df_sel it removes
# no further rows. The recount confirms all columns are still complete.
df_sel <- na.omit(df_sel)
colSums(is.na(df_sel))
## GDP per capita (USD) Employment rate
## 0 0
## Quality of support network Student skills
## 0 0
## Air pollution Water quality
## 0 0
## Voter turnout Life expectancy
## 0 0
## Self-reported health Feeling safe walking alone at night
## 0 0
## Homicide rate Life satisfaction
## 0 0
Setelah na.omit(), tidak terdapat missing value. Data siap dianalisis.
# Descriptive statistics per variable, restricted to the four columns that
# show location, spread and range.
describe(x = df_sel)[, c("mean", "sd", "min", "max")]
## mean sd min max
## GDP per capita (USD) 48520.11 29243.02 7327.0 131384.0
## Employment rate 68.84 7.61 48.0 80.0
## Quality of support network 91.03 5.49 77.0 98.0
## Student skills 486.46 29.32 406.0 526.0
## Air pollution 13.29 6.21 5.5 27.3
## Water quality 84.76 8.96 62.0 98.0
## Voter turnout 69.03 12.68 45.0 92.0
## Life expectancy 80.91 2.56 75.1 84.4
## Self-reported health 68.43 13.26 34.0 89.0
## Feeling safe walking alone at night 73.68 13.12 41.0 93.0
## Homicide rate 2.61 5.73 0.2 26.8
## Life satisfaction 6.69 0.67 4.9 7.9
Memberikan gambaran umum distribusi tiap variabel
Terlihat skala antar variabel berbeda, sehingga perlu dilakukan standarisasi.
# Pearson correlation matrix of the selected indicators, drawn as a coloured
# grid with the coefficient printed in each cell.
cor_mat <- cor(df_sel)
corrplot(
  corr = cor_mat,
  method = "color",
  addCoef.col = "black",
  tl.cex = 0.3
)
Terdapat korelasi antar variabel yang berarti indikasi data cocok untuk analisis faktor
# Boxplots of all indicators on a common axis.
# The variables are standardized first (z-scores): on the raw scale GDP per
# capita is several orders of magnitude larger than the other indicators, so
# their boxes collapse onto the axis and outliers cannot be seen.
# las = 2 rotates the axis labels so the long variable names stay readable.
boxplot(scale(df_sel),
        las = 2,
        col = "lightblue",
        main = "Boxplot Semua Variabel",
        ylab = "z-score")
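Karena skala antar variabel sangat berbeda, outlier sebaiknya dinilai pada skala terstandarisasi. Berdasarkan statistik deskriptif, Homicide rate memiliki nilai maksimum 26.8 (sekitar 4 simpangan baku di atas rata-rata 2.61), sehingga terdapat potensi outlier yang perlu diperhatikan saat menginterpretasikan hasil analisis.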
# Standardize every column to mean 0 and standard deviation 1 (z-scores),
# then print the column means as a check (they should be ~0 up to
# floating-point error).
df_scaled <- scale(df_sel, center = TRUE, scale = TRUE)
colMeans(df_scaled)
## GDP per capita (USD) Employment rate
## 5.991243e-17 -1.091469e-16
## Quality of support network Student skills
## -8.394655e-16 -7.198634e-16
## Air pollution Water quality
## 9.489406e-17 -1.200241e-16
## Voter turnout Life expectancy
## -3.407911e-16 6.042464e-16
## Self-reported health Feeling safe walking alone at night
## -1.982273e-16 -1.249001e-16
## Homicide rate Life satisfaction
## -2.541135e-17 -7.403050e-17
# Column-wise standard deviation of the standardized matrix (expected: 1).
apply(df_scaled, MARGIN = 2, FUN = sd)
## GDP per capita (USD) Employment rate
## 1 1
## Quality of support network Student skills
## 1 1
## Air pollution Water quality
## 1 1
## Voter turnout Life expectancy
## 1 1
## Self-reported health Feeling safe walking alone at night
## 1 1
## Homicide rate Life satisfaction
## 1 1
Semua variabel sudah distandarisasi dengan benar.
# Kaiser-Meyer-Olkin measure of sampling adequacy (overall and per variable).
KMO(r = df_scaled)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = df_scaled)
## Overall MSA = 0.78
## MSA for each item =
## GDP per capita (USD) Employment rate
## 0.86 0.83
## Quality of support network Student skills
## 0.77 0.84
## Air pollution Water quality
## 0.87 0.77
## Voter turnout Life expectancy
## 0.62 0.66
## Self-reported health Feeling safe walking alone at night
## 0.61 0.82
## Homicide rate Life satisfaction
## 0.67 0.83
Nilai KMO sebesar 0.78 (> 0.5) menunjukkan kecukupan sampel yang baik, sehingga data layak untuk analisis faktor.
# Bartlett's test of sphericity on the correlation matrix; n is the number of
# observations (rows) the correlations were computed from.
cortest.bartlett(R = cor(df_scaled), n = nrow(df_scaled))
## $chisq
## [1] 308.3601
##
## $p.value
## [1] 5.461402e-33
##
## $df
## [1] 66
Hasil uji Bartlett menunjukkan nilai Chi-square sebesar 308.36 dengan p-value 5.46×10⁻³³ (< 0.05), sehingga H0 ditolak. Hal ini menunjukkan bahwa terdapat korelasi yang signifikan antar variabel dan data layak untuk dilakukan analisis faktor.
# PCA on the already standardized data, so prcomp() is told not to center or
# scale again. Squaring the component standard deviations gives the
# eigenvalues (variance carried by each component).
pca_full <- prcomp(df_scaled, center = FALSE, scale. = FALSE)
eigenvalues <- pca_full$sdev^2
print(eigenvalues)
## [1] 6.05439907 1.76187846 1.25771323 0.73564041 0.64637102 0.41666106
## [7] 0.32640916 0.29940033 0.20328328 0.11188434 0.10492639 0.08143324
# Kaiser criterion: keep only the eigenvalues greater than 1.
kaiser_pca <- Filter(function(ev) ev > 1, eigenvalues)
print(kaiser_pca)
## [1] 6.054399 1.761878 1.257713
Karena ada 3 eigenvalue > 1, maka dipertahankan 3 komponen utama.
# Scree plot: eigenvalue per component, drawn as connected points.
plot(
  x = eigenvalues,
  type = "b",
  main = "Scree Plot PCA",
  xlab = "Principal Component",
  ylab = "Eigenvalue"
)
# Red reference line at the Kaiser cut-off (eigenvalue = 1).
abline(h = 1, col = "red")
Berdasarkan scree plot dan kriteria Kaiser (eigenvalue > 1), diperoleh tiga komponen utama yang memiliki eigenvalue lebih dari 1. Selain itu, grafik menunjukkan adanya titik elbow pada komponen ke-3, sehingga dapat disimpulkan bahwa jumlah komponen optimal yang dapat dipertahankan adalah tiga komponen.
# Coefficients of the first three principal components, rounded to 3 decimals.
# NOTE: prcomp()$rotation holds the eigenvectors (unit-length weights), not
# loadings scaled by the component standard deviation.
loadings_pca <- pca_full$rotation[, seq_len(3)]
round(loadings_pca, digits = 3)
## PC1 PC2 PC3
## GDP per capita (USD) -0.314 -0.284 -0.096
## Employment rate -0.332 0.201 0.341
## Quality of support network -0.327 0.081 0.075
## Student skills -0.309 0.364 -0.129
## Air pollution 0.314 0.156 -0.240
## Water quality -0.310 -0.005 0.404
## Voter turnout -0.149 -0.277 -0.446
## Life expectancy -0.250 -0.079 -0.446
## Self-reported health -0.119 -0.637 -0.028
## Feeling safe walking alone at night -0.330 0.233 -0.036
## Homicide rate 0.259 -0.324 0.464
## Life satisfaction -0.345 -0.260 0.130
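Nilai pada tabel di atas adalah koefisien eigenvector (bobot komponen); loading diperoleh dari eigenvector × akar eigenvalue. Kriteria |loading| ≥ 0.5 = kontribusi kuat berlaku untuk nilai loading tersebut.
Bisa digunakan untuk menamai komponen/faktor.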
# Standard deviation, proportion of variance and cumulative proportion for
# every principal component.
summary(object = pca_full)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4606 1.3274 1.1215 0.8577 0.80397 0.64549 0.5713
## Proportion of Variance 0.5045 0.1468 0.1048 0.0613 0.05386 0.03472 0.0272
## Cumulative Proportion 0.5045 0.6514 0.7562 0.8175 0.87133 0.90606 0.9333
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.54717 0.45087 0.33449 0.32392 0.28537
## Proportion of Variance 0.02495 0.01694 0.00932 0.00874 0.00679
## Cumulative Proportion 0.95821 0.97515 0.98447 0.99321 1.00000
# Share of total variance captured by the first three components.
local({
  component_var <- pca_full$sdev^2
  sum(component_var[1:3]) / sum(component_var)
})
## [1] 0.7561659
Hasilnya sekitar 75.6%, artinya tiga komponen pertama sudah mewakili sekitar 75.6% informasi (variansi) data.
# Biplot of observations and variables on the first two principal components;
# repel = TRUE keeps the text labels from overlapping.
fviz_pca_biplot(
  pca_full,
  axes = c(1, 2),
  repel = TRUE
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Variabel yang searah → berkorelasi positif
Berlawanan arah → korelasi negatif
Parallel Analysis (FA)
# Parallel analysis compares the observed eigenvalues with those of random
# data, so the seed is fixed to make the suggested number of factors
# reproducible. Only the factor solution is requested (fa = "fa"), estimated
# by maximum likelihood.
set.seed(123)
fa.parallel(
  df_scaled,
  fa = "fa",
  fm = "ml"
)
## Parallel analysis suggests that the number of factors = 2 and the number of components = NA
Berdasarkan parallel analysis, diperoleh bahwa dua faktor pertama memiliki eigenvalue yang lebih besar dibandingkan eigenvalue dari data acak. Dengan demikian, jumlah faktor yang optimal untuk dipertahankan dalam analisis adalah dua faktor.
# Eigenvalues of the correlation matrix of the standardized data. These match
# the PCA eigenvalues printed earlier, because PCA on standardized data
# decomposes the same correlation matrix.
cor_matrix <- cor(df_scaled)
eigenvalues_fa <- eigen(cor_matrix)[["values"]]
print(eigenvalues_fa)
## [1] 6.05439907 1.76187846 1.25771323 0.73564041 0.64637102 0.41666106
## [7] 0.32640916 0.29940033 0.20328328 0.11188434 0.10492639 0.08143324
Berdasarkan hasil perhitungan eigenvalue, terdapat tiga faktor dengan nilai lebih dari 1, yaitu 6.054, 1.762, dan 1.258. Eigenvalue lainnya bernilai kurang dari 1.
# Kaiser criterion for the factor analysis: eigenvalues greater than 1.
kaiser_fa <- subset(eigenvalues_fa, eigenvalues_fa > 1)
print(kaiser_fa)
## [1] 6.054399 1.761878 1.257713
# Number of eigenvalues that exceed 1 (factors suggested by the Kaiser rule).
length(kaiser_fa)
## [1] 3
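Menurut aturan Kaiser (eigenvalue > 1), terdapat 3 faktor yang dapat dipertahankan. Namun, parallel analysis menyarankan 2 faktor, dan model analisis faktor berikutnya menggunakan 2 faktor.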
# Two-factor maximum-likelihood factor analysis with an orthogonal varimax
# rotation. Loadings below 0.3 in absolute value are hidden when printing so
# the dominant pattern is easier to read.
fa_model <- fa(
  r = df_scaled,
  nfactors = 2,
  rotate = "varimax",
  fm = "ml"
)
print(fa_model$loadings, cutoff = 0.3)
##
## Loadings:
## ML2 ML1
## GDP per capita (USD) 0.343 0.709
## Employment rate 0.705 0.454
## Quality of support network 0.630 0.495
## Student skills 0.924
## Air pollution -0.442 -0.619
## Water quality 0.493 0.570
## Voter turnout 0.330
## Life expectancy 0.406 0.385
## Self-reported health 0.646
## Feeling safe walking alone at night 0.758 0.309
## Homicide rate -0.746
## Life satisfaction 0.362 0.894
##
## ML2 ML1
## SS loadings 3.805 3.244
## Proportion Var 0.317 0.270
## Cumulative Var 0.317 0.587
Loading bertanda positif → variabel bergerak searah dengan faktor
Loading bertanda negatif (mis. Air pollution, Homicide rate) → variabel berlawanan arah dengan faktor
# Communality of each variable: the share of its variance explained by the
# retained factors.
fa_model[["communality"]]
## GDP per capita (USD) Employment rate
## 0.6200209 0.7024331
## Quality of support network Student skills
## 0.6414562 0.8631492
## Air pollution Water quality
## 0.5791136 0.5675115
## Voter turnout Life expectancy
## 0.1304454 0.3128098
## Self-reported health Feeling safe walking alone at night
## 0.4719106 0.6700756
## Homicide rate Life satisfaction
## 0.5600334 0.9295971
> 0.5 → baik
< 0.4 → kurang baik
# Variance accounted for by each factor (sums of squared loadings and the
# proportional / cumulative shares).
fa_model[["Vaccounted"]]
## ML2 ML1
## SS loadings 3.8049944 3.2435620
## Proportion Var 0.3170829 0.2702968
## Cumulative Var 0.3170829 0.5873797
## Proportion Explained 0.5398261 0.4601739
## Cumulative Proportion 0.5398261 1.0000000
Dua faktor yang terbentuk mampu menjelaskan 58.74% variasi data, yang menunjukkan bahwa model sudah cukup baik dalam merepresentasikan struktur data