library(readr)
library(psych)
library(GPArotation)
##
## Attaching package: 'GPArotation'
## The following objects are masked from 'package:psych':
##
## equamax, varimin
library(corrplot)
## corrplot 0.95 loaded
library(factoextra)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Memanggil (load) package yang dibutuhkan untuk analisis PCA dan FA.
# Read the Better Life Index 2024 CSV; readr guesses the column types
# (one character column, the rest numeric, per the message it prints).
df <- read_csv(file = "better-life-index-2024 (2).csv")
## Rows: 38 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Country
## dbl (25): GDP per capita (USD), Dwellings without basic facilities, Housing ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Strip leading/trailing whitespace from the header names so the exact-name
# lookup below cannot miss a column because of stray spaces.
names(df) <- trimws(names(df))

# Indicators retained for the PCA / factor analysis.
cols <- c(
  "GDP per capita (USD)",
  "Employment rate",
  "Quality of support network",
  "Student skills",
  "Air pollution",
  "Water quality",
  "Voter turnout",
  "Life expectancy",
  "Self-reported health",
  "Feeling safe walking alone at night",
  "Homicide rate",
  "Life satisfaction"
)

# Keep only those columns and preview the first rows.
df_sel <- df[cols]
head(df_sel)
## # A tibble: 6 × 12
## `GDP per capita (USD)` `Employment rate` `Quality of support network`
## <dbl> <dbl> <dbl>
## 1 66589 73 93
## 2 59225 72 92
## 3 55536 65 90
## 4 54866 70 93
## 5 16616 56 88
## 6 7327 58 80
## # ℹ 9 more variables: `Student skills` <dbl>, `Air pollution` <dbl>,
## # `Water quality` <dbl>, `Voter turnout` <dbl>, `Life expectancy` <dbl>,
## # `Self-reported health` <dbl>, `Feeling safe walking alone at night` <dbl>,
## # `Homicide rate` <dbl>, `Life satisfaction` <dbl>
Variabel yang dipilih mewakili aspek ekonomi, kesehatan, keamanan, lingkungan, dan kepuasan hidup.
# Count missing values per selected column before any cleaning.
colSums(is.na(df_sel))
## GDP per capita (USD) Employment rate
## 0 0
## Quality of support network Student skills
## 0 1
## Air pollution Water quality
## 0 0
## Voter turnout Life expectancy
## 0 0
## Self-reported health Feeling safe walking alone at night
## 0 0
## Homicide rate Life satisfaction
## 0 0
# Listwise deletion: drop every row that has at least one missing value
# (the count above shows one missing entry, in "Student skills").
df_sel <- stats::na.omit(df_sel)

# Re-count to confirm no missing values remain.
colSums(is.na(df_sel))
## GDP per capita (USD) Employment rate
## 0 0
## Quality of support network Student skills
## 0 0
## Air pollution Water quality
## 0 0
## Voter turnout Life expectancy
## 0 0
## Self-reported health Feeling safe walking alone at night
## 0 0
## Homicide rate Life satisfaction
## 0 0
Memastikan tidak ada data kosong sebelum analisis.
# NOTE(review): repeats the missing-value count already run after na.omit().
colSums(is.na(df_sel))
## GDP per capita (USD) Employment rate
## 0 0
## Quality of support network Student skills
## 0 0
## Air pollution Water quality
## 0 0
## Voter turnout Life expectancy
## 0 0
## Self-reported health Feeling safe walking alone at night
## 0 0
## Homicide rate Life satisfaction
## 0 0
# NOTE(review): this chunk repeats the earlier na.omit() step. The count
# printed just before it is already all zeros, so no further rows are dropped.
df_sel <- stats::na.omit(df_sel)
colSums(is.na(df_sel))
## GDP per capita (USD) Employment rate
## 0 0
## Quality of support network Student skills
## 0 0
## Air pollution Water quality
## 0 0
## Voter turnout Life expectancy
## 0 0
## Self-reported health Feeling safe walking alone at night
## 0 0
## Homicide rate Life satisfaction
## 0 0
Setelah na.omit(), tidak terdapat missing value. Data siap dianalisis.
# Descriptive statistics per variable, reduced to location, spread and range.
describe(df_sel)[, c("mean", "sd", "min", "max")]
## mean sd min max
## GDP per capita (USD) 48520.11 29243.02 7327.0 131384.0
## Employment rate 68.84 7.61 48.0 80.0
## Quality of support network 91.03 5.49 77.0 98.0
## Student skills 486.46 29.32 406.0 526.0
## Air pollution 13.29 6.21 5.5 27.3
## Water quality 84.76 8.96 62.0 98.0
## Voter turnout 69.03 12.68 45.0 92.0
## Life expectancy 80.91 2.56 75.1 84.4
## Self-reported health 68.43 13.26 34.0 89.0
## Feeling safe walking alone at night 73.68 13.12 41.0 93.0
## Homicide rate 2.61 5.73 0.2 26.8
## Life satisfaction 6.69 0.67 4.9 7.9
Memberikan gambaran umum distribusi tiap variabel.
Terlihat skala antar variabel sangat berbeda, sehingga perlu dilakukan standarisasi.
# Correlation matrix of the selected indicators (cor() defaults to Pearson),
# drawn as a coloured grid with the coefficient printed in each cell.
cor_mat <- cor(df_sel)
corrplot(
  cor_mat,
  method = "color",
  addCoef.col = "black",
  tl.cex = 0.3
)
Terdapat korelasi antar variabel yang berarti indikasi data cocok untuk analisis faktor
# Box plots of the z-scored indicators.
# The raw columns live on very different scales (GDP per capita reaches six
# digits while life satisfaction stays below 10), so drawing them on one raw
# axis flattens every box except GDP and hides outliers. Standardising each
# column first puts all variables on a comparable axis.
boxplot(
  scale(df_sel),
  las = 2,
  col = "lightblue",
  main = "Boxplot Semua Variabel"
)
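Sebagian besar variabel tidak menunjukkan outlier ekstrem. Namun, Homicide rate memiliki nilai maksimum 26.8, sekitar 4 simpangan baku di atas rata-ratanya (2.61; sd 5.73), sehingga perlu dicermati karena dapat memengaruhi hasil analisis.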
# z-score every column (subtract the mean, divide by the standard deviation);
# the arguments spell out scale()'s defaults.
df_scaled <- scale(df_sel, center = TRUE, scale = TRUE)

# Sanity check: column means should be zero up to floating-point error.
colMeans(df_scaled)
## GDP per capita (USD) Employment rate
## 5.991243e-17 -1.091469e-16
## Quality of support network Student skills
## -8.394655e-16 -7.198634e-16
## Air pollution Water quality
## 9.489406e-17 -1.200241e-16
## Voter turnout Life expectancy
## -3.407911e-16 6.042464e-16
## Self-reported health Feeling safe walking alone at night
## -1.982273e-16 -1.249001e-16
## Homicide rate Life satisfaction
## -2.541135e-17 -7.403050e-17
# Sanity check: every standardised column should have unit standard deviation.
apply(df_scaled, MARGIN = 2, FUN = sd)
## GDP per capita (USD) Employment rate
## 1 1
## Quality of support network Student skills
## 1 1
## Air pollution Water quality
## 1 1
## Voter turnout Life expectancy
## 1 1
## Self-reported health Feeling safe walking alone at night
## 1 1
## Homicide rate Life satisfaction
## 1 1
Semua variabel sudah distandarisasi dengan benar.
# Kaiser-Meyer-Olkin sampling adequacy, overall and per variable.
KMO(r = df_scaled)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = df_scaled)
## Overall MSA = 0.78
## MSA for each item =
## GDP per capita (USD) Employment rate
## 0.86 0.83
## Quality of support network Student skills
## 0.77 0.84
## Air pollution Water quality
## 0.87 0.77
## Voter turnout Life expectancy
## 0.62 0.66
## Self-reported health Feeling safe walking alone at night
## 0.61 0.82
## Homicide rate Life satisfaction
## 0.67 0.83
Nilai KMO sebesar 0.78 menunjukkan bahwa data termasuk kategori cukup baik (kategori "middling" menurut Kaiser, 0.70–0.79), sehingga layak dilakukan analisis faktor.
# Bartlett's test of sphericity on the correlation matrix; n is the number
# of observations left after removing incomplete rows.
cortest.bartlett(R = cor(df_scaled), n = nrow(df_scaled))
## $chisq
## [1] 308.3601
##
## $p.value
## [1] 5.461402e-33
##
## $df
## [1] 66
Hasil uji Bartlett menunjukkan nilai Chi-square sebesar 308.36 dengan p-value 5.46×10⁻³³ (< 0.05), sehingga H0 ditolak. Hal ini menunjukkan bahwa terdapat korelasi yang signifikan antar variabel dan data layak untuk dilakukan analisis faktor.
# PCA on the matrix that was standardised beforehand, hence prcomp() is told
# not to centre or scale it a second time.
pca_full <- prcomp(df_scaled, center = FALSE, scale. = FALSE)

# Variance of each component = squared component standard deviation.
eigenvalues <- pca_full$sdev^2
eigenvalues
## [1] 6.05439907 1.76187846 1.25771323 0.73564041 0.64637102 0.41666106
## [7] 0.32640916 0.29940033 0.20328328 0.11188434 0.10492639 0.08143324
# Scree plot: eigenvalue against component index, points joined by lines.
plot(
  eigenvalues,
  type = "b",
  main = "Scree Plot PCA",
  xlab = "Principal Component",
  ylab = "Eigenvalue"
)
# Reference line for the Kaiser criterion (eigenvalue = 1).
abline(h = 1, col = "red")
Berdasarkan scree plot dan kriteria Kaiser (eigenvalue > 1), diperoleh tiga komponen utama yang memiliki eigenvalue lebih dari 1. Selain itu, grafik menunjukkan adanya titik elbow pada komponen ke-3, sehingga dapat disimpulkan bahwa jumlah komponen optimal yang dapat dipertahankan adalah tiga komponen.
# Coefficients of the first three components. prcomp()'s `rotation` holds the
# eigenvectors; they are not rescaled by the component standard deviations.
loadings_pca <- pca_full$rotation[, seq_len(3)]
round(loadings_pca, digits = 3)
## PC1 PC2 PC3
## GDP per capita (USD) -0.314 -0.284 -0.096
## Employment rate -0.332 0.201 0.341
## Quality of support network -0.327 0.081 0.075
## Student skills -0.309 0.364 -0.129
## Air pollution 0.314 0.156 -0.240
## Water quality -0.310 -0.005 0.404
## Voter turnout -0.149 -0.277 -0.446
## Life expectancy -0.250 -0.079 -0.446
## Self-reported health -0.119 -0.637 -0.028
## Feeling safe walking alone at night -0.330 0.233 -0.036
## Homicide rate 0.259 -0.324 0.464
## Life satisfaction -0.345 -0.260 0.130
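Catatan: nilai pada `pca_full$rotation` adalah koefisien eigenvector (panjang tiap kolom = 1), bukan loading berupa korelasi variabel–komponen. Loading diperoleh dari koefisien × standar deviasi komponen (`pca_full$sdev`); pada skala loading tersebut, |loading| ≥ 0.5 menunjukkan kontribusi kuat.
Pola loading ini bisa digunakan untuk menamai komponen/faktor.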
# Standard deviation, proportion of variance and cumulative proportion
# for every principal component.
summary(pca_full)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4606 1.3274 1.1215 0.8577 0.80397 0.64549 0.5713
## Proportion of Variance 0.5045 0.1468 0.1048 0.0613 0.05386 0.03472 0.0272
## Cumulative Proportion 0.5045 0.6514 0.7562 0.8175 0.87133 0.90606 0.9333
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.54717 0.45087 0.33449 0.32392 0.28537
## Proportion of Variance 0.02495 0.01694 0.00932 0.00874 0.00679
## Cumulative Proportion 0.95821 0.97515 0.98447 0.99321 1.00000
Proporsi Variansi
# Share of total variance captured by the first three components
# (eigenvalues holds the squared component standard deviations).
sum(eigenvalues[1:3]) / sum(eigenvalues)
## [1] 0.7561659
Hasilnya 0.756, artinya 3 komponen pertama sudah mewakili sekitar 75.6% informasi (variansi) data.
# Biplot of observations and variables on the first two components;
# repel = TRUE keeps text labels from overlapping.
fviz_pca_biplot(
  pca_full,
  axes = c(1, 2),
  repel = TRUE
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Variabel yang searah → berkorelasi positif
Berlawanan arah → korelasi negatif
# Three-factor model fitted by maximum likelihood with an orthogonal
# varimax rotation.
fa_model <- fa(
  r = df_scaled,
  nfactors = 3,
  rotate = "varimax",
  fm = "ml"
)

# Show the loading matrix, hiding entries with absolute value below 0.3.
print(fa_model$loadings, cutoff = 0.3)
##
## Loadings:
## ML2 ML1 ML3
## GDP per capita (USD) 0.443 0.660
## Employment rate 0.912 0.314
## Quality of support network 0.569 0.514
## Student skills 0.536 0.668
## Air pollution -0.634 -0.410
## Water quality 0.844
## Voter turnout 0.437
## Life expectancy 0.487 0.461
## Self-reported health 0.786
## Feeling safe walking alone at night 0.562 0.551
## Homicide rate -0.984
## Life satisfaction 0.672 0.626
##
## ML2 ML1 ML3
## SS loadings 3.567 2.524 2.166
## Proportion Var 0.297 0.210 0.180
## Cumulative Var 0.297 0.508 0.688
# NOTE(review): this refits the same three-factor ML / varimax model that was
# already fitted in the previous chunk; the printed loadings are identical.
fa_model <- fa(
  r = df_scaled,
  nfactors = 3,
  rotate = "varimax",
  fm = "ml"
)
print(fa_model$loadings, cutoff = 0.3)
##
## Loadings:
## ML2 ML1 ML3
## GDP per capita (USD) 0.443 0.660
## Employment rate 0.912 0.314
## Quality of support network 0.569 0.514
## Student skills 0.536 0.668
## Air pollution -0.634 -0.410
## Water quality 0.844
## Voter turnout 0.437
## Life expectancy 0.487 0.461
## Self-reported health 0.786
## Feeling safe walking alone at night 0.562 0.551
## Homicide rate -0.984
## Life satisfaction 0.672 0.626
##
## ML2 ML1 ML3
## SS loadings 3.567 2.524 2.166
## Proportion Var 0.297 0.210 0.180
## Cumulative Var 0.297 0.508 0.688
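Loading bertanda positif → variabel berkorelasi positif dengan faktor tersebut.
Loading bertanda negatif (misalnya Air pollution dan Homicide rate) → variabel berkorelasi negatif dengan faktor tersebut.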
# Communality of each variable: the share of its variance reproduced by the
# three common factors.
fa_model$communality
## GDP per capita (USD) Employment rate
## 0.6879319 0.9296813
## Quality of support network Student skills
## 0.6690915 0.7352641
## Air pollution Water quality
## 0.6094717 0.7501573
## Voter turnout Life expectancy
## 0.2324836 0.4786077
## Self-reported health Feeling safe walking alone at night
## 0.6519042 0.6402054
## Homicide rate Life satisfaction
## 0.9950024 0.8769444
Communality > 0.5 → baik
Communality < 0.4 → kurang baik (misalnya Voter turnout = 0.23)
# Parallel analysis compares the observed eigenvalues with those obtained from
# simulated random data, so the suggested number of factors can change between
# runs. Seeding the RNG makes the result reproducible.
# NOTE(review): fa.parallel() extracts with its default method (minres), while
# the fitted model uses fm = "ml"; confirm whether the two should match.
set.seed(123)
fa.parallel(df_scaled, fa = "fa")
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Parallel analysis suggests that the number of factors = 2 and the number of components = NA
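Parallel analysis membandingkan eigenvalue aktual dengan eigenvalue dari data acak. Hasilnya menyarankan 2 faktor, berbeda dengan 3 faktor yang digunakan pada model (berdasarkan kriteria Kaiser); peringatan ultra-Heywood juga menandakan bahwa hasil ini perlu diinterpretasikan dengan hati-hati.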
# Variance accounted for by each rotated factor: SS loadings, proportion of
# total variance, and proportion of the explained variance.
fa_model$Vaccounted
## ML2 ML1 ML3
## SS loadings 3.5665271 2.5244145 2.1658039
## Proportion Var 0.2972106 0.2103679 0.1804837
## Cumulative Var 0.2972106 0.5075785 0.6880621
## Proportion Explained 0.4319531 0.3057396 0.2623072
## Cumulative Proportion 0.4319531 0.7376928 1.0000000
Hasil analisis faktor menunjukkan terdapat 3 faktor yang memiliki SS loadings > 1. 1. Faktor pertama (ML2) menjelaskan variasi sebesar 29.72%. 2. Faktor kedua (ML1) 21.04%. 3. Faktor ketiga (ML3) 18.05%. Secara kumulatif, ketiga faktor tersebut mampu menjelaskan sebesar 68.81% variasi data. Hal ini menunjukkan bahwa model faktor yang terbentuk cukup baik dalam merepresentasikan struktur data.