Load Library

library(readr)
library(psych)
library(GPArotation)

## 
## Attaching package: 'GPArotation'

## The following objects are masked from 'package:psych':
## 
##     equamax, varimin

library(corrplot)

## corrplot 0.95 loaded

library(factoextra)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Memuat (memanggil) package yang dibutuhkan untuk analisis PCA dan FA; package tersebut harus sudah terinstal sebelumnya.

Import Dataset

# Read the Better Life Index CSV (path is relative to the working directory).
df <- read_csv("better-life-index-2024 (2).csv")

## Rows: 38 Columns: 26
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Country
## dbl (25): GDP per capita (USD), Dwellings without basic facilities, Housing ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Strip stray leading/trailing whitespace from the column headers so the
# names listed below match exactly.
names(df) <- trimws(names(df))

# Indicators kept for the PCA / factor analysis.
cols <- c(
  "GDP per capita (USD)", "Employment rate", "Quality of support network",
  "Student skills", "Air pollution", "Water quality", "Voter turnout",
  "Life expectancy", "Self-reported health",
  "Feeling safe walking alone at night", "Homicide rate", "Life satisfaction"
)

# Keep only the selected columns and preview the first rows.
df_sel <- df[cols]
head(df_sel)

## # A tibble: 6 × 12
##   `GDP per capita (USD)` `Employment rate` `Quality of support network`
##                    <dbl>             <dbl>                        <dbl>
## 1                  66589                73                           93
## 2                  59225                72                           92
## 3                  55536                65                           90
## 4                  54866                70                           93
## 5                  16616                56                           88
## 6                   7327                58                           80
## # ℹ 9 more variables: `Student skills` <dbl>, `Air pollution` <dbl>,
## #   `Water quality` <dbl>, `Voter turnout` <dbl>, `Life expectancy` <dbl>,
## #   `Self-reported health` <dbl>, `Feeling safe walking alone at night` <dbl>,
## #   `Homicide rate` <dbl>, `Life satisfaction` <dbl>

Variabel yang dipilih mewakili aspek ekonomi, kesehatan, keamanan, lingkungan, dan kepuasan hidup.

Cek Missing Value

# Count missing values in each selected column.
colSums(is.na(df_sel))

##                GDP per capita (USD)                     Employment rate 
##                                   0                                   0 
##          Quality of support network                      Student skills 
##                                   0                                   1 
##                       Air pollution                       Water quality 
##                                   0                                   0 
##                       Voter turnout                     Life expectancy 
##                                   0                                   0 
##                Self-reported health Feeling safe walking alone at night 
##                                   0                                   0 
##                       Homicide rate                   Life satisfaction 
##                                   0                                   0

# Drop every row that has at least one missing value.
df_sel <- na.omit(df_sel)

# Re-count missing values to confirm none remain.
colSums(is.na(df_sel))

##                GDP per capita (USD)                     Employment rate 
##                                   0                                   0 
##          Quality of support network                      Student skills 
##                                   0                                   0 
##                       Air pollution                       Water quality 
##                                   0                                   0 
##                       Voter turnout                     Life expectancy 
##                                   0                                   0 
##                Self-reported health Feeling safe walking alone at night 
##                                   0                                   0 
##                       Homicide rate                   Life satisfaction 
##                                   0                                   0

Memastikan tidak ada data kosong sebelum analisis.

# Repeated check: df_sel was already filtered with na.omit() earlier.
colSums(is.na(df_sel))

##                GDP per capita (USD)                     Employment rate 
##                                   0                                   0 
##          Quality of support network                      Student skills 
##                                   0                                   0 
##                       Air pollution                       Water quality 
##                                   0                                   0 
##                       Voter turnout                     Life expectancy 
##                                   0                                   0 
##                Self-reported health Feeling safe walking alone at night 
##                                   0                                   0 
##                       Homicide rate                   Life satisfaction 
##                                   0                                   0

# Repeated step: df_sel has no missing values at this point, so na.omit()
# leaves the rows unchanged.
df_sel <- na.omit(df_sel)

colSums(is.na(df_sel))

##                GDP per capita (USD)                     Employment rate 
##                                   0                                   0 
##          Quality of support network                      Student skills 
##                                   0                                   0 
##                       Air pollution                       Water quality 
##                                   0                                   0 
##                       Voter turnout                     Life expectancy 
##                                   0                                   0 
##                Self-reported health Feeling safe walking alone at night 
##                                   0                                   0 
##                       Homicide rate                   Life satisfaction 
##                                   0                                   0

Setelah na.omit(), tidak terdapat missing value. Data siap dianalisis.

Statistik Deskriptif

# Summary statistics per variable, restricted to mean, sd, min and max.
describe(df_sel)[, c("mean", "sd", "min", "max")]

##                                         mean       sd    min      max
## GDP per capita (USD)                48520.11 29243.02 7327.0 131384.0
## Employment rate                        68.84     7.61   48.0     80.0
## Quality of support network             91.03     5.49   77.0     98.0
## Student skills                        486.46    29.32  406.0    526.0
## Air pollution                          13.29     6.21    5.5     27.3
## Water quality                          84.76     8.96   62.0     98.0
## Voter turnout                          69.03    12.68   45.0     92.0
## Life expectancy                        80.91     2.56   75.1     84.4
## Self-reported health                   68.43    13.26   34.0     89.0
## Feeling safe walking alone at night    73.68    13.12   41.0     93.0
## Homicide rate                           2.61     5.73    0.2     26.8
## Life satisfaction                       6.69     0.67    4.9      7.9

Memberikan gambaran umum distribusi tiap variabel

Terlihat skala antar variabel berbeda, sehingga perlu standarisasi.

Matriks Korelasi

# Correlation matrix of the selected indicators (cor() default method).
cor_mat <- cor(df_sel)

# Colour-coded correlation plot with the coefficient printed in each cell.
corrplot(cor_mat, method = "color", addCoef.col = "black", tl.cex = 0.3)

Terdapat korelasi antar variabel yang berarti indikasi data cocok untuk analisis faktor

Boxplot

# Side-by-side boxplots of every selected variable on its raw scale.
# NOTE(review): the variables differ by orders of magnitude (see the
# descriptive statistics), so the large-valued ones set the axis range;
# consider boxplot(scale(df_sel), ...) to compare spread and outliers.
boxplot(df_sel,
        las=2,
        col="lightblue",
        main="Boxplot Semua Variabel")

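Pada boxplot tidak terlihat outlier ekstrem yang mengganggu analisis; namun, statistik deskriptif menunjukkan Homicide rate memiliki nilai maksimum 26.8, jauh di atas rata-ratanya (2.61; sd 5.73), sehingga variabel ini perlu dicermati sebagai potensi outlier.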

Standarisasi Data

# Standardise every column (scale() centres and scales by default).
df_scaled <- scale(df_sel)

# Sanity check: the column means should be numerically zero.
colMeans(df_scaled)

##                GDP per capita (USD)                     Employment rate 
##                        5.991243e-17                       -1.091469e-16 
##          Quality of support network                      Student skills 
##                       -8.394655e-16                       -7.198634e-16 
##                       Air pollution                       Water quality 
##                        9.489406e-17                       -1.200241e-16 
##                       Voter turnout                     Life expectancy 
##                       -3.407911e-16                        6.042464e-16 
##                Self-reported health Feeling safe walking alone at night 
##                       -1.982273e-16                       -1.249001e-16 
##                       Homicide rate                   Life satisfaction 
##                       -2.541135e-17                       -7.403050e-17

# Sanity check: the standard deviation of every scaled column should be 1.
apply(df_scaled, MARGIN = 2, FUN = sd)

##                GDP per capita (USD)                     Employment rate 
##                                   1                                   1 
##          Quality of support network                      Student skills 
##                                   1                                   1 
##                       Air pollution                       Water quality 
##                                   1                                   1 
##                       Voter turnout                     Life expectancy 
##                                   1                                   1 
##                Self-reported health Feeling safe walking alone at night 
##                                   1                                   1 
##                       Homicide rate                   Life satisfaction 
##                                   1                                   1

Semua variabel sudah distandarisasi dengan benar.

KMO

# Kaiser-Meyer-Olkin factor adequacy: overall MSA and MSA for each item.
KMO(df_scaled)

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = df_scaled)
## Overall MSA =  0.78
## MSA for each item = 
##                GDP per capita (USD)                     Employment rate 
##                                0.86                                0.83 
##          Quality of support network                      Student skills 
##                                0.77                                0.84 
##                       Air pollution                       Water quality 
##                                0.87                                0.77 
##                       Voter turnout                     Life expectancy 
##                                0.62                                0.66 
##                Self-reported health Feeling safe walking alone at night 
##                                0.61                                0.82 
##                       Homicide rate                   Life satisfaction 
##                                0.67                                0.83

Nilai KMO sebesar (0.78) menunjukkan bahwa data termasuk kategori (baik), sehingga layak dilakukan analisis faktor.

Bartlett Test

# Bartlett's test on the correlation matrix; n is the number of observations
# (rows of the scaled data).
cortest.bartlett(cor(df_scaled), n = nrow(df_scaled))

## $chisq
## [1] 308.3601
## 
## $p.value
## [1] 5.461402e-33
## 
## $df
## [1] 66

Hasil uji Bartlett menunjukkan nilai Chi-square sebesar 308.36 dengan p-value 5.46×10⁻³³ (< 0.05), sehingga H0 ditolak. Hal ini menunjukkan bahwa terdapat korelasi yang signifikan antar variabel dan data layak untuk dilakukan analisis faktor.

PCA

# PCA on the already-standardised data, so no further centring or scaling.
pca_full <- prcomp(df_scaled, center = FALSE, scale. = FALSE)

# Eigenvalues are the squared component standard deviations.
eigenvalues <- pca_full$sdev^2
eigenvalues

##  [1] 6.05439907 1.76187846 1.25771323 0.73564041 0.64637102 0.41666106
##  [7] 0.32640916 0.29940033 0.20328328 0.11188434 0.10492639 0.08143324

# Scree plot of the eigenvalues with a red reference line at eigenvalue = 1.
plot(
  eigenvalues,
  type = "b",
  main = "Scree Plot PCA",
  xlab = "Principal Component",
  ylab = "Eigenvalue"
)
abline(h = 1, col = "red")

Berdasarkan scree plot dan kriteria Kaiser (eigenvalue > 1), diperoleh tiga komponen utama yang memiliki eigenvalue lebih dari 1. Selain itu, grafik menunjukkan adanya titik elbow pada komponen ke-3, sehingga dapat disimpulkan bahwa jumlah komponen optimal yang dapat dipertahankan adalah tiga komponen.

# Coefficients of the first three principal components, rounded to 3 digits.
# NOTE(review): prcomp()$rotation holds unit-length eigenvectors, not
# loadings; scale each column by the matching pca_full$sdev to get loadings
# before applying a |loading| >= 0.5 rule.
loadings_pca <- pca_full$rotation[,1:3]
round(loadings_pca,3)

##                                        PC1    PC2    PC3
## GDP per capita (USD)                -0.314 -0.284 -0.096
## Employment rate                     -0.332  0.201  0.341
## Quality of support network          -0.327  0.081  0.075
## Student skills                      -0.309  0.364 -0.129
## Air pollution                        0.314  0.156 -0.240
## Water quality                       -0.310 -0.005  0.404
## Voter turnout                       -0.149 -0.277 -0.446
## Life expectancy                     -0.250 -0.079 -0.446
## Self-reported health                -0.119 -0.637 -0.028
## Feeling safe walking alone at night -0.330  0.233 -0.036
## Homicide rate                        0.259 -0.324  0.464
## Life satisfaction                   -0.345 -0.260  0.130

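Catatan: nilai di atas adalah koefisien eigenvector dari `pca_full$rotation` (panjang tiap kolom = 1), bukan loading. Loading diperoleh dengan mengalikan tiap kolom dengan standar deviasi komponennya (`pca_full$sdev`); kriteria |loading| ≥ 0.5 = kontribusi kuat berlaku untuk loading tersebut.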

Bisa digunakan untuk menamai faktor

# Standard deviation, proportion of variance and cumulative proportion for
# every principal component.
summary(pca_full)

## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6    PC7
## Standard deviation     2.4606 1.3274 1.1215 0.8577 0.80397 0.64549 0.5713
## Proportion of Variance 0.5045 0.1468 0.1048 0.0613 0.05386 0.03472 0.0272
## Cumulative Proportion  0.5045 0.6514 0.7562 0.8175 0.87133 0.90606 0.9333
##                            PC8     PC9    PC10    PC11    PC12
## Standard deviation     0.54717 0.45087 0.33449 0.32392 0.28537
## Proportion of Variance 0.02495 0.01694 0.00932 0.00874 0.00679
## Cumulative Proportion  0.95821 0.97515 0.98447 0.99321 1.00000

Proporsi Variansi

# Share of the total variance captured by the first three components.
pc_var <- pca_full$sdev^2
sum(pc_var[1:3]) / sum(pc_var)

## [1] 0.7561659

Hasilnya adalah 0.756, artinya 3 komponen pertama sudah mewakili sekitar 75.6% informasi (variansi) data.

Biplot PCA

# Biplot of observations and variables on the first two components;
# repel = TRUE asks factoextra to avoid overlapping text labels.
fviz_pca_biplot(pca_full, axes = c(1, 2), repel = TRUE)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
##   Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Variabel yang searah → berkorelasi positif

Berlawanan arah → korelasi negatif

FA

# Maximum-likelihood factor analysis with three varimax-rotated factors.
fa_model <- fa(df_scaled, nfactors = 3, rotate = "varimax", fm = "ml")

# Print the loadings, hiding entries with absolute value below 0.3.
print(fa_model$loadings, cutoff = 0.3)

## 
## Loadings:
##                                     ML2    ML1    ML3   
## GDP per capita (USD)                 0.443         0.660
## Employment rate                      0.912  0.314       
## Quality of support network           0.569  0.514       
## Student skills                       0.536  0.668       
## Air pollution                       -0.634        -0.410
## Water quality                        0.844              
## Voter turnout                                      0.437
## Life expectancy                             0.487  0.461
## Self-reported health                               0.786
## Feeling safe walking alone at night  0.562  0.551       
## Homicide rate                              -0.984       
## Life satisfaction                    0.672         0.626
## 
##                  ML2   ML1   ML3
## SS loadings    3.567 2.524 2.166
## Proportion Var 0.297 0.210 0.180
## Cumulative Var 0.297 0.508 0.688

# Repeated step: refits the same three-factor model as the previous fa() call
# and prints the same loadings.
fa_model <- fa(df_scaled,
               nfactors=3,
               rotate="varimax",
               fm="ml")

print(fa_model$loadings, cutoff=0.3)

## 
## Loadings:
##                                     ML2    ML1    ML3   
## GDP per capita (USD)                 0.443         0.660
## Employment rate                      0.912  0.314       
## Quality of support network           0.569  0.514       
## Student skills                       0.536  0.668       
## Air pollution                       -0.634        -0.410
## Water quality                        0.844              
## Voter turnout                                      0.437
## Life expectancy                             0.487  0.461
## Self-reported health                               0.786
## Feeling safe walking alone at night  0.562  0.551       
## Homicide rate                              -0.984       
## Life satisfaction                    0.672         0.626
## 
##                  ML2   ML1   ML3
## SS loadings    3.567 2.524 2.166
## Proportion Var 0.297 0.210 0.180
## Cumulative Var 0.297 0.508 0.688

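Loading bertanda sama pada satu faktor → variabel bergerak searah terhadap faktor tersebut

Loading bertanda berlawanan → variabel bergerak berlawanan arah (mis. Air pollution pada ML2 dan Homicide rate pada ML1 bernilai negatif)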

# Communality of each variable under the fitted three-factor model.
fa_model$communality

##                GDP per capita (USD)                     Employment rate 
##                           0.6879319                           0.9296813 
##          Quality of support network                      Student skills 
##                           0.6690915                           0.7352641 
##                       Air pollution                       Water quality 
##                           0.6094717                           0.7501573 
##                       Voter turnout                     Life expectancy 
##                           0.2324836                           0.4786077 
##                Self-reported health Feeling safe walking alone at night 
##                           0.6519042                           0.6402054 
##                       Homicide rate                   Life satisfaction 
##                           0.9950024                           0.8769444

Komunalitas > 0.5 → baik

Komunalitas < 0.4 → kurang baik (mis. Voter turnout = 0.23)

Parallel Analysis (FA)

# Parallel analysis to suggest the number of factors; fa = "fa" limits the
# analysis to factors (no component solution is reported).
fa.parallel(df_scaled, fa="fa")

## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected.  Examine the results carefully

## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA

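Bandingkan eigenvalue aktual dengan eigenvalue dari data acak. Parallel analysis menyarankan 2 faktor, berbeda dari 3 faktor yang digunakan pada model FA di atas; peringatan ultra-Heywood juga menandakan bahwa hasil ini perlu ditafsirkan dengan hati-hati.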

Variance Accounted

# Variance accounted for by each factor: SS loadings plus proportion and
# cumulative proportion of variance.
fa_model$Vaccounted

##                             ML2       ML1       ML3
## SS loadings           3.5665271 2.5244145 2.1658039
## Proportion Var        0.2972106 0.2103679 0.1804837
## Cumulative Var        0.2972106 0.5075785 0.6880621
## Proportion Explained  0.4319531 0.3057396 0.2623072
## Cumulative Proportion 0.4319531 0.7376928 1.0000000

Hasil analisis faktor (3 faktor, rotasi varimax) menunjukkan bahwa ketiga faktor memiliki SS loadings > 1: 1. Faktor ML2 menjelaskan variasi sebesar 29.72%. 2. Faktor ML1 21.04%. 3. Faktor ML3 18.05%. Secara kumulatif, ketiga faktor tersebut mampu menjelaskan sebesar 68.81% variasi data. Hal ini menunjukkan bahwa model faktor yang terbentuk cukup baik dalam merepresentasikan struktur data.

PCA-FA (Anmul)