Load Library

library(readr)
library(psych)
library(GPArotation)
## 
## Attaching package: 'GPArotation'
## The following objects are masked from 'package:psych':
## 
##     equamax, varimin
library(corrplot)
## corrplot 0.95 loaded
library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Memanggil (memuat) package yang dibutuhkan untuk analisis PCA dan FA.

Import Dataset

# Read the Better Life Index CSV into a tibble; the path is relative to the
# current working directory.
df <- read_csv("better-life-index-2024 (2).csv")
## Rows: 38 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Country
## dbl (25): GDP per capita (USD), Dwellings without basic facilities, Housing ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Strip stray whitespace around the CSV header names so the exact-name
# lookup below cannot miss a column.
names(df) <- trimws(names(df))

# The twelve indicators carried into the PCA and factor analysis.
cols <- c(
  "GDP per capita (USD)", "Employment rate", "Quality of support network",
  "Student skills", "Air pollution", "Water quality",
  "Voter turnout", "Life expectancy", "Self-reported health",
  "Feeling safe walking alone at night", "Homicide rate", "Life satisfaction"
)

# Keep only those columns, in the listed order, and preview the first rows.
df_sel <- df[cols]
head(df_sel)
## # A tibble: 6 × 12
##   `GDP per capita (USD)` `Employment rate` `Quality of support network`
##                    <dbl>             <dbl>                        <dbl>
## 1                  66589                73                           93
## 2                  59225                72                           92
## 3                  55536                65                           90
## 4                  54866                70                           93
## 5                  16616                56                           88
## 6                   7327                58                           80
## # ℹ 9 more variables: `Student skills` <dbl>, `Air pollution` <dbl>,
## #   `Water quality` <dbl>, `Voter turnout` <dbl>, `Life expectancy` <dbl>,
## #   `Self-reported health` <dbl>, `Feeling safe walking alone at night` <dbl>,
## #   `Homicide rate` <dbl>, `Life satisfaction` <dbl>

Variabel yang dipilih mewakili aspek ekonomi, kesehatan, keamanan, lingkungan, dan kepuasan hidup.

Cek Missing Value

# Count the missing values in each selected column.
colSums(is.na(df_sel))
##                GDP per capita (USD)                     Employment rate 
##                                   0                                   0 
##          Quality of support network                      Student skills 
##                                   0                                   1 
##                       Air pollution                       Water quality 
##                                   0                                   0 
##                       Voter turnout                     Life expectancy 
##                                   0                                   0 
##                Self-reported health Feeling safe walking alone at night 
##                                   0                                   0 
##                       Homicide rate                   Life satisfaction 
##                                   0                                   0
# Drop every row that has an NA in any selected column, then re-count the
# missing values to confirm that none remain.
df_sel <- na.omit(df_sel)

colSums(is.na(df_sel))
##                GDP per capita (USD)                     Employment rate 
##                                   0                                   0 
##          Quality of support network                      Student skills 
##                                   0                                   0 
##                       Air pollution                       Water quality 
##                                   0                                   0 
##                       Voter turnout                     Life expectancy 
##                                   0                                   0 
##                Self-reported health Feeling safe walking alone at night 
##                                   0                                   0 
##                       Homicide rate                   Life satisfaction 
##                                   0                                   0

Memastikan tidak ada data kosong sebelum analisis.

# NOTE(review): this repeats the missing-value count that was already run
# right after the first na.omit(); it adds no new information.
colSums(is.na(df_sel))
##                GDP per capita (USD)                     Employment rate 
##                                   0                                   0 
##          Quality of support network                      Student skills 
##                                   0                                   0 
##                       Air pollution                       Water quality 
##                                   0                                   0 
##                       Voter turnout                     Life expectancy 
##                                   0                                   0 
##                Self-reported health Feeling safe walking alone at night 
##                                   0                                   0 
##                       Homicide rate                   Life satisfaction 
##                                   0                                   0
# NOTE(review): this second na.omit() is a no-op, because the count printed
# just before it is already zero for every column. Candidate for removal.
df_sel <- na.omit(df_sel)

colSums(is.na(df_sel))
##                GDP per capita (USD)                     Employment rate 
##                                   0                                   0 
##          Quality of support network                      Student skills 
##                                   0                                   0 
##                       Air pollution                       Water quality 
##                                   0                                   0 
##                       Voter turnout                     Life expectancy 
##                                   0                                   0 
##                Self-reported health Feeling safe walking alone at night 
##                                   0                                   0 
##                       Homicide rate                   Life satisfaction 
##                                   0                                   0

Setelah na.omit(), tidak terdapat missing value: satu baris dengan nilai kosong pada Student skills dihapus, sehingga tersisa 37 dari 38 observasi. Data siap dianalisis.

Statistik Deskriptif

# Descriptive statistics per variable, keeping only the mean, standard
# deviation, minimum and maximum columns of the describe() table.
describe(df_sel)[,c("mean","sd","min","max")]
##                                         mean       sd    min      max
## GDP per capita (USD)                48520.11 29243.02 7327.0 131384.0
## Employment rate                        68.84     7.61   48.0     80.0
## Quality of support network             91.03     5.49   77.0     98.0
## Student skills                        486.46    29.32  406.0    526.0
## Air pollution                          13.29     6.21    5.5     27.3
## Water quality                          84.76     8.96   62.0     98.0
## Voter turnout                          69.03    12.68   45.0     92.0
## Life expectancy                        80.91     2.56   75.1     84.4
## Self-reported health                   68.43    13.26   34.0     89.0
## Feeling safe walking alone at night    73.68    13.12   41.0     93.0
## Homicide rate                           2.61     5.73    0.2     26.8
## Life satisfaction                       6.69     0.67    4.9      7.9

Memberikan gambaran umum distribusi tiap variabel

Terlihat skala antar variabel berbeda, sehingga perlu standarisasi.

Matriks Korelasi

# Pairwise correlations between the selected indicators (cor() defaults).
cor_mat <- cor(df_sel)

# Colour-coded correlation matrix with every coefficient printed in black
# and shrunken text labels.
corrplot(
  cor_mat,
  tl.cex = 0.3,
  addCoef.col = "black",
  method = "color"
)

Terdapat korelasi antar variabel yang berarti indikasi data cocok untuk analisis faktor

Boxplot

# Draw the boxplots on standardized values (z-scores) rather than raw data.
# On the raw scale GDP per capita (thousands to over a hundred thousand)
# stretches the y-axis so far that the boxes of every other variable collapse
# onto a flat line, which makes outliers impossible to judge. scale() is
# applied inline so this chunk does not depend on objects created later.
boxplot(scale(df_sel),
        las = 2,
        col = "lightblue",
        main = "Boxplot Semua Variabel",
        ylab = "Nilai terstandarisasi (z-score)")

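Tidak semua variabel bebas dari outlier: statistik deskriptif menunjukkan Homicide rate memiliki nilai maksimum 26.8 dengan rata-rata 2.61 dan sd 5.73 (sekitar 4 sd di atas rata-rata), sehingga terdapat indikasi outlier ekstrem pada variabel tersebut yang perlu diperhatikan saat menafsirkan hasil analisis.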

Standarisasi Data

# Standardize every column with scale() defaults (center and scale), then
# print the column means; they should be zero up to floating-point error.
df_scaled <- scale(df_sel)

colMeans(df_scaled)
##                GDP per capita (USD)                     Employment rate 
##                        5.991243e-17                       -1.091469e-16 
##          Quality of support network                      Student skills 
##                       -8.394655e-16                       -7.198634e-16 
##                       Air pollution                       Water quality 
##                        9.489406e-17                       -1.200241e-16 
##                       Voter turnout                     Life expectancy 
##                       -3.407911e-16                        6.042464e-16 
##                Self-reported health Feeling safe walking alone at night 
##                       -1.982273e-16                       -1.249001e-16 
##                       Homicide rate                   Life satisfaction 
##                       -2.541135e-17                       -7.403050e-17
# Column-wise standard deviations of the scaled matrix; each should equal 1.
apply(df_scaled,2,sd)
##                GDP per capita (USD)                     Employment rate 
##                                   1                                   1 
##          Quality of support network                      Student skills 
##                                   1                                   1 
##                       Air pollution                       Water quality 
##                                   1                                   1 
##                       Voter turnout                     Life expectancy 
##                                   1                                   1 
##                Self-reported health Feeling safe walking alone at night 
##                                   1                                   1 
##                       Homicide rate                   Life satisfaction 
##                                   1                                   1

Semua variabel sudah distandarisasi dengan benar.

KMO

# Kaiser-Meyer-Olkin sampling adequacy: overall MSA plus one MSA per variable.
KMO(df_scaled)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = df_scaled)
## Overall MSA =  0.78
## MSA for each item = 
##                GDP per capita (USD)                     Employment rate 
##                                0.86                                0.83 
##          Quality of support network                      Student skills 
##                                0.77                                0.84 
##                       Air pollution                       Water quality 
##                                0.87                                0.77 
##                       Voter turnout                     Life expectancy 
##                                0.62                                0.66 
##                Self-reported health Feeling safe walking alone at night 
##                                0.61                                0.82 
##                       Homicide rate                   Life satisfaction 
##                                0.67                                0.83

Nilai KMO keseluruhan sebesar 0.78 menunjukkan bahwa data termasuk kategori baik, sehingga layak dilakukan analisis faktor.

Bartlett Test

# Bartlett's test on the correlation matrix of the scaled data; n is the
# number of observations (rows).
cortest.bartlett(cor(df_scaled), n=nrow(df_scaled))
## $chisq
## [1] 308.3601
## 
## $p.value
## [1] 5.461402e-33
## 
## $df
## [1] 66

Hasil uji Bartlett menunjukkan nilai Chi-square sebesar 308.36 dengan p-value 5.46×10⁻³³ (< 0.05), sehingga H0 ditolak. Hal ini menunjukkan bahwa terdapat korelasi yang signifikan antar variabel dan data layak untuk dilakukan analisis faktor.

PCA

# PCA on the already-standardized matrix, so prcomp() is told not to center
# or rescale a second time.
pca_full <- prcomp(
  df_scaled,
  scale. = FALSE,
  center = FALSE
)

# Squared component standard deviations give the eigenvalues.
eigenvalues <- pca_full$sdev^2
eigenvalues
##  [1] 6.05439907 1.76187846 1.25771323 0.73564041 0.64637102 0.41666106
##  [7] 0.32640916 0.29940033 0.20328328 0.11188434 0.10492639 0.08143324

Kaiser Rule (PCA)

# Kaiser criterion: keep only the eigenvalues that exceed 1.
kaiser_pca <- eigenvalues[which(eigenvalues > 1)]
kaiser_pca
## [1] 6.054399 1.761878 1.257713

Karena ada 3 eigenvalue > 1, maka dipertahankan 3 komponen utama.

Scree Plot PCA

# Scree plot: eigenvalue against component index, points joined by lines.
plot(
  x = eigenvalues,
  type = "b",
  xlab = "Principal Component",
  ylab = "Eigenvalue",
  main = "Scree Plot PCA"
)
# Red horizontal reference line at eigenvalue = 1 (Kaiser cut-off).
abline(col = "red", h = 1)

Berdasarkan scree plot dan kriteria Kaiser (eigenvalue > 1), diperoleh tiga komponen utama yang memiliki eigenvalue lebih dari 1. Selain itu, grafik menunjukkan adanya titik elbow pada komponen ke-3, sehingga dapat disimpulkan bahwa jumlah komponen optimal yang dapat dipertahankan adalah tiga komponen.

# First three columns of the rotation matrix, rounded to 3 decimals.
# These are unit-length eigenvector coefficients (the squares in each column
# sum to 1), not variable-component correlations.
# NOTE(review): to apply a |loading| >= 0.5 rule, scale each column by the
# matching pca_full$sdev first; as printed, the values are not loadings.
loadings_pca <- pca_full$rotation[,1:3]
round(loadings_pca,3)
##                                        PC1    PC2    PC3
## GDP per capita (USD)                -0.314 -0.284 -0.096
## Employment rate                     -0.332  0.201  0.341
## Quality of support network          -0.327  0.081  0.075
## Student skills                      -0.309  0.364 -0.129
## Air pollution                        0.314  0.156 -0.240
## Water quality                       -0.310 -0.005  0.404
## Voter turnout                       -0.149 -0.277 -0.446
## Life expectancy                     -0.250 -0.079 -0.446
## Self-reported health                -0.119 -0.637 -0.028
## Feeling safe walking alone at night -0.330  0.233 -0.036
## Homicide rate                        0.259 -0.324  0.464
## Life satisfaction                   -0.345 -0.260  0.130

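Catatan: nilai pada tabel di atas adalah koefisien eigenvector (`pca_full$rotation`), bukan loading. Loading (korelasi antara variabel dan komponen) diperoleh dengan mengalikan setiap kolom eigenvector dengan standar deviasi komponennya (`pca_full$sdev`); ambang 0.5 berlaku untuk nilai loading tersebut.

|Loading| ≥ 0.5 = kontribusi kuat

Bisa digunakan untuk menamai komponen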

# Standard deviation, proportion of variance and cumulative proportion for
# every principal component.
summary(pca_full)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6    PC7
## Standard deviation     2.4606 1.3274 1.1215 0.8577 0.80397 0.64549 0.5713
## Proportion of Variance 0.5045 0.1468 0.1048 0.0613 0.05386 0.03472 0.0272
## Cumulative Proportion  0.5045 0.6514 0.7562 0.8175 0.87133 0.90606 0.9333
##                            PC8     PC9    PC10    PC11    PC12
## Standard deviation     0.54717 0.45087 0.33449 0.32392 0.28537
## Proportion of Variance 0.02495 0.01694 0.00932 0.00874 0.00679
## Cumulative Proportion  0.95821 0.97515 0.98447 0.99321 1.00000

Cumulative Variance

# Share of the total variance captured by the first three components:
# sum of their squared standard deviations over the sum for all components.
sum(head(pca_full$sdev, 3)^2) / sum(pca_full$sdev^2)
## [1] 0.7561659

Hasilnya yaitu 75.6%, berarti 3 komponen sudah mewakili sekitar 75.6% informasi (variansi) data.

Biplot PCA

# Biplot of the PCA on the first two components; repel = TRUE is passed so
# that text labels are pushed apart instead of overlapping.
fviz_pca_biplot(pca_full, repel = TRUE, axes = c(1, 2))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Variabel yang searah → berkorelasi positif

Berlawanan arah → korelasi negatif

FA

Parallel Analysis (FA)

# Fix the RNG seed so the parallel analysis is reproducible.
set.seed(123)
# Parallel analysis for factors only (fa = "fa"), using maximum-likelihood
# factoring (fm = "ml").
fa.parallel(df_scaled, fm = "ml", fa = "fa")

## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA

Berdasarkan hasil parallel analysis, diperoleh bahwa dua faktor pertama memiliki eigenvalue yang lebih besar dibandingkan eigenvalue dari data acak. Dengan demikian, jumlah faktor yang optimal untuk dipertahankan dalam analisis adalah dua faktor.

Eigen value

# Correlation matrix of the standardized data.
cor_matrix <- cor(df_scaled)

# Eigenvalues of that correlation matrix.
eigenvalues_fa <- eigen(cor_matrix)[["values"]]
eigenvalues_fa
##  [1] 6.05439907 1.76187846 1.25771323 0.73564041 0.64637102 0.41666106
##  [7] 0.32640916 0.29940033 0.20328328 0.11188434 0.10492639 0.08143324

Berdasarkan hasil perhitungan eigenvalue, terdapat tiga faktor dengan nilai lebih dari 1, yaitu 6.054, 1.762, dan 1.258. Eigenvalue lainnya bernilai kurang dari 1.

Kaiser Rule (FA)

# Kaiser criterion: keep only the eigenvalues that exceed 1.
kaiser_fa <- eigenvalues_fa[which(eigenvalues_fa > 1)]
kaiser_fa
## [1] 6.054399 1.761878 1.257713
# Number of eigenvalues above 1.
length(kaiser_fa)
## [1] 3

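Menurut Kaiser Rule terdapat 3 faktor yang dapat dipertahankan. Hasil ini berbeda dengan parallel analysis yang menyarankan 2 faktor; model FA pada bagian selanjutnya menggunakan 2 faktor, sejalan dengan hasil parallel analysis.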

Model FA (Factor Loadings)

# Two-factor maximum-likelihood factor analysis with varimax rotation.
fa_model <- fa(
  df_scaled,
  nfactors = 2,
  fm = "ml",
  rotate = "varimax"
)

# Print the loading matrix with cutoff = 0.3 so small loadings are left blank.
print(fa_model[["loadings"]], cutoff = 0.3)
## 
## Loadings:
##                                     ML2    ML1   
## GDP per capita (USD)                 0.343  0.709
## Employment rate                      0.705  0.454
## Quality of support network           0.630  0.495
## Student skills                       0.924       
## Air pollution                       -0.442 -0.619
## Water quality                        0.493  0.570
## Voter turnout                               0.330
## Life expectancy                      0.406  0.385
## Self-reported health                        0.646
## Feeling safe walking alone at night  0.758  0.309
## Homicide rate                       -0.746       
## Life satisfaction                    0.362  0.894
## 
##                  ML2   ML1
## SS loadings    3.805 3.244
## Proportion Var 0.317 0.270
## Cumulative Var 0.317 0.587

Loading bertanda sama pada satu faktor → variabel-variabel tersebut berkorelasi positif melalui faktor itu

Loading berlawanan tanda → korelasi negatif

Communality (Komunalitas)

# Communality of each variable under the fitted factor model.
fa_model$communality
##                GDP per capita (USD)                     Employment rate 
##                           0.6200209                           0.7024331 
##          Quality of support network                      Student skills 
##                           0.6414562                           0.8631492 
##                       Air pollution                       Water quality 
##                           0.5791136                           0.5675115 
##                       Voter turnout                     Life expectancy 
##                           0.1304454                           0.3128098 
##                Self-reported health Feeling safe walking alone at night 
##                           0.4719106                           0.6700756 
##                       Homicide rate                   Life satisfaction 
##                           0.5600334                           0.9295971

Komunalitas > 0.5 → baik

Komunalitas < 0.4 → kurang baik (pada hasil di atas: Voter turnout 0.13 dan Life expectancy 0.31)

Variance Accounted

# Variance accounted for by each factor: SS loadings, proportion of variance,
# and the cumulative / explained proportions.
fa_model$Vaccounted
##                             ML2       ML1
## SS loadings           3.8049944 3.2435620
## Proportion Var        0.3170829 0.2702968
## Cumulative Var        0.3170829 0.5873797
## Proportion Explained  0.5398261 0.4601739
## Cumulative Proportion 0.5398261 1.0000000

Dua faktor yang terbentuk mampu menjelaskan 58.74% variasi data, yang menunjukkan bahwa model sudah cukup baik dalam merepresentasikan struktur data